I built a FoundationDB cluster running version 7.1.31 in three_datacenter mode.
When I add storage servers to the cluster and their number grows past 1200 and approaches 1400, the data distributor crashes.
<Event Severity="10" Time="1694420800.972599" DateTime="2023-09-11T08:26:40Z" Type="GetMagazineSample" ID="0000000000000000" Size="256" Backtrace="addr2line -e fdbserver.debug -p -C -f -i 0x4328d58 0x426a12a 0x426a355 0x40c65a6 0x40c690b 0x2730b65 0x272de50 0x272e485 0x272ec44 0x188fd90 0x18ad0a5 0x18aeaae 0x1850651 0x185485b 0x18595ae 0x17295bc 0x173093d 0x42b2f48 0xdb5eff 0x7f73df71f495" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD,RK" />
<Event Severity="20" Time="1694420800.972599" DateTime="2023-09-11T08:26:40Z" Type="Net2RunLoopTrace" ID="0000000000000000" TraceTime="1694420801.036488" Trace="addr2line -e fdbserver.debug -p -C -f -i 0x7f73dfad95d0 0x40c723d 0x24d760f 0x24d90c5 0x24cf36d 0x185045b 0x185485b 0x18595ae 0x17295bc 0x173093d 0x42b2f48 0xdb5eff 0x7f73df71f495 0xe18e42" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD,RK" />
<Event Severity="20" Time="1694420800.972599" DateTime="2023-09-11T08:26:40Z" Type="Net2RunLoopTrace" ID="0000000000000000" TraceTime="1694420801.161554" Trace="addr2line -e fdbserver.debug -p -C -f -i 0x7f73dfad95d0 0x40d0b83 0x11f7349 0x24c7359 0x1850e5d 0x185485b 0x18595ae 0x17295bc 0x173093d 0x42b2f48 0xdb5eff 0x7f73df71f495 0xe18e42" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD,RK" />
<Event Severity="20" Time="1694420803.655281" DateTime="2023-09-11T08:26:43Z" Type="Net2RunLoopTrace" ID="0000000000000000" TraceTime="1694420803.664908" Trace="addr2line -e fdbserver.debug -p -C -f -i 0x7f73dfad95d0 0x1ec9752 0x1ecf409 0x1efbaeb 0x42b2f48 0xdb5eff 0x7f73df71f495 0xe18e42" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD,RK" />
<Event Severity="10" Time="1694420823.354485" DateTime="2023-09-11T08:27:03Z" Type="HugeArenaSample" ID="0000000000000000" Count="1" Size="70404" Backtrace="addr2line -e fdbserver.debug -p -C -f -i 0x42630c7 0x42468b3 0x4246d66 0x40e4bc6 0x42b2f48 0xdb5eff 0x7f73df71f495" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD" />
<Event Severity="10" Time="1694420828.964950" DateTime="2023-09-11T08:27:08Z" Type="HugeArenaSample" ID="0000000000000000" Count="1" Size="15072" Backtrace="addr2line -e fdbserver.debug -p -C -f -i 0x42630c7 0x42468b3 0x4246d66 0x28250a1 0x37e6041 0x37e6a97 0x3724388 0x37ca538 0x2ab22d8 0xe87a90 0x40e1738 0x40e1ab8 0x42b2f48 0xdb5eff 0x7f73df71f495" ThreadID="920255406301111743" Machine="100.71.8.128:30005" LogGroup="default" Roles="DD" />
Here is the debug info (the symbolized backtrace of the GetMagazineSample event above):
# addr2line -e fdbserver.debug -p -C -f -i 0x4328d58 0x426a12a 0x426a355 0x40c65a6 0x40c690b 0x2730b65 0x272de50 0x272e485 0x272ec44 0x188fd90 0x18ad0a5 0x18aeaae 0x1850651 0x185485b 0x18595ae 0x17295bc
std::string::_M_rep() const at /opt/rh/devtoolset-11/root/usr/include/c++/11/bits/basic_string.h:3404
(inlined by) std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&) at /opt/rh/devtoolset-11/root/usr/include/c++/11/bits/basic_string.h:3604
(inlined by) std::basic_string<char, std::char_traits<char>, std::allocator<char> > std::operator+<char, std::char_traits<char>, std::allocator<char> >(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, char const*) at /opt/rh/devtoolset-11/root/usr/include/c++/11/bits/basic_string.h:6135
(inlined by) BaseTraceEvent::backtrace(std::string const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/Trace.cpp:1216
std::string::_M_rep() const at /opt/rh/devtoolset-11/root/usr/include/c++/11/bits/basic_string.h:3404
(inlined by) std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string() at /opt/rh/devtoolset-11/root/usr/include/c++/11/bits/basic_string.h:3768
(inlined by) FastAllocator<256>::getMagazine() at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/FastAlloc.cpp:518
FastAllocator<256>::allocate() at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/FastAlloc.cpp:335
waitForContinuousFailure(IFailureMonitor* const&, Endpoint const&, double const&, double const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/FastAlloc.h:217
(inlined by) waitForContinuousFailure(IFailureMonitor* const&, Endpoint const&, double const&, double const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/FailureMonitor.actor.cpp:33
IFailureMonitor::onFailedFor(Endpoint const&, double, double) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/FailureMonitor.actor.cpp:72
ReplyPromise<Void>::getEndpoint(TaskPriority) const at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/fdbrpc.h:148 (discriminator 8)
(inlined by) void setReplyPriority<Void>(ReplyPromise<Void> const&, TaskPriority) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/fdbrpc.h:255 (discriminator 8)
(inlined by) Future<decltype (((((getReplyPromise((declval<ReplyPromise<Void> >)())).getFuture)()).getValue)())> RequestStream<ReplyPromise<Void> >::getReply<ReplyPromise<Void> >(ReplyPromise<Void> const&, TaskPriority) const at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/fdbrpc.h:703 (discriminator 8)
(inlined by) Future<ErrorOr<decltype (((((getReplyPromise((declval<ReplyPromise<Void> >)())).getFuture)()).getValue)())> > RequestStream<ReplyPromise<Void> >::getReplyUnlessFailedFor<ReplyPromise<Void> >(ReplyPromise<Void> const&, double, double, TaskPriority) const at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbrpc/fdbrpc.h:805 (discriminator 8)
Future<ErrorOr<Void> >::Future(Future<ErrorOr<Void> > const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/flow.h:810
(inlined by) StrictFuture<ErrorOr<Void> >::StrictFuture(Future<ErrorOr<Void> > const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/flow.h:896
(inlined by) a_body1loopBody1 at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/WaitFailure.actor.cpp:48
waitFailureClient(RequestStream<ReplyPromise<Void> > const&, double const&, double const&, bool const&, TaskPriority const&) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:315
(inlined by) a_body1 at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:294
(inlined by) WaitFailureClientActor at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:665
(inlined by) waitFailureClient(RequestStream<ReplyPromise<Void> > const&, double const&, double const&, bool const&, TaskPriority const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/WaitFailure.actor.cpp:40
Future<Void>::Future(Future<Void> const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/flow.h:810
(inlined by) StrictFuture<Void>::StrictFuture(Future<Void> const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/flow/flow.h:896
(inlined by) a_body1loopBody1 at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/WaitFailure.actor.cpp:74
(inlined by) a_body1loopHead1 at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:745
(inlined by) a_body1 at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:724
(inlined by) WaitFailureClientStrictActor at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/WaitFailure.actor.g.cpp:991
(inlined by) waitFailureClientStrict(RequestStream<ReplyPromise<Void> > const&, double const&, TaskPriority const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/WaitFailure.actor.cpp:70
DDTeamCollectionImpl::StorageServerFailureTrackerActorState<DDTeamCollectionImpl::StorageServerFailureTrackerActor>::a_body1loopBody1(int) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:1578 (discriminator 2)
DDTeamCollectionImpl::StorageServerFailureTrackerActorState<DDTeamCollectionImpl::StorageServerFailureTrackerActor>::a_body1loopHead1(int) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:8548
(inlined by) DDTeamCollectionImpl::StorageServerFailureTrackerActorState<DDTeamCollectionImpl::StorageServerFailureTrackerActor>::a_body1(int) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:8515
(inlined by) DDTeamCollectionImpl::StorageServerFailureTrackerActor::StorageServerFailureTrackerActor(DDTeamCollection* const&, TCServerInfo* const&, Database const&, ServerStatus* const&, long const&) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:9011
(inlined by) DDTeamCollectionImpl::storageServerFailureTracker(DDTeamCollection* const&, TCServerInfo* const&, Database const&, ServerStatus* const&, long const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:1527
(inlined by) DDTeamCollectionImpl::StorageServerTrackerActorState<DDTeamCollectionImpl::StorageServerTrackerActor>::a_body1loopBody1cont1(int) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:1185
DDTeamCollectionImpl::StorageServerTrackerActorState<DDTeamCollectionImpl::StorageServerTrackerActor>::a_body1loopBody1(int) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:5141
DDTeamCollectionImpl::StorageServerTrackerActorState<DDTeamCollectionImpl::StorageServerTrackerActor>::a_body1loopHead1(int) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:4898
(inlined by) DDTeamCollectionImpl::StorageServerTrackerActorState<DDTeamCollectionImpl::StorageServerTrackerActor>::a_body1(int) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:4833
(inlined by) DDTeamCollectionImpl::StorageServerTrackerActor::StorageServerTrackerActor(DDTeamCollection* const&, Database const&, TCServerInfo* const&, Promise<Void> const&, long const&, DDEnabledState const* const&, bool const&) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:6591
(inlined by) DDTeamCollectionImpl::storageServerTracker(DDTeamCollection* const&, Database const&, TCServerInfo* const&, Promise<Void> const&, long const&, DDEnabledState const* const&, bool const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:998
(inlined by) DDTeamCollection::storageServerTracker(Database, TCServerInfo*, Promise<Void>, long, DDEnabledState const&, bool) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:3426
(inlined by) DDTeamCollection::addServer(StorageServerInterface, ProcessClass, Promise<Void>, long, DDEnabledState const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:4779
DDTeamCollectionImpl::InitActorState<DDTeamCollectionImpl::InitActor>::a_body1(int) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:482
DDTeamCollectionImpl::InitActor::InitActor(DDTeamCollection* const&, Reference<InitialDataDistribution> const&, DDEnabledState const* const&) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:3258
(inlined by) DDTeamCollectionImpl::init(DDTeamCollection* const&, Reference<InitialDataDistribution> const&, DDEnabledState const* const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:463
(inlined by) DDTeamCollection::init(Reference<InitialDataDistribution>, DDEnabledState const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:3406
(inlined by) DDTeamCollectionImpl::RunActorState<DDTeamCollectionImpl::RunActor>::a_body1(int) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:2945
(inlined by) DDTeamCollectionImpl::RunActor::RunActor(Reference<DDTeamCollection> const&, Reference<InitialDataDistribution> const&, TeamCollectionInterface const&, Reference<IAsyncListener<RequestStream<RecruitStorageRequest> > > const&, DDEnabledState const* const&) at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DDTeamCollection.actor.g.cpp:22169
(inlined by) DDTeamCollectionImpl::run(Reference<DDTeamCollection> const&, Reference<InitialDataDistribution> const&, TeamCollectionInterface const&, Reference<IAsyncListener<RequestStream<RecruitStorageRequest> > > const&, DDEnabledState const* const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:2934
(inlined by) DDTeamCollection::run(Reference<DDTeamCollection>, Reference<InitialDataDistribution>, TeamCollectionInterface, Reference<IAsyncListener<RequestStream<RecruitStorageRequest> > >, DDEnabledState const&) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DDTeamCollection.actor.cpp:5143
(anonymous namespace)::DataDistributionActorState<(anonymous namespace)::DataDistributionActor>::a_body1loopBody1cont2loopBody1(int) at /home/foundationdb_ci/src/oOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOoOo/foundationdb/fdbserver/DataDistribution.actor.cpp:816
(inlined by) a_body1loopBody1cont2break1 at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DataDistribution.actor.g.cpp:5889
(inlined by) a_body1loopBody1cont2loopBody1 at /home/foundationdb_ci/foundationdb_build_output/dbdbdbdbdbdbdbdbdbdbdbdbdbdbdbdb/fdbserver/DataDistribution.actor.g.cpp:5826
Is there a bug in the data distributor (DD)?
We can easily reproduce the error using this script:
#!/bin/bash
# set -x
# Directory passed to every fdbserver process as --logdir.
fdblogdir=/tmp/logs
# IP address passed to every fdbserver process as --public-address (with a per-process port).
public_address=100.71.8.128
# Stop any fdbserver left over from a previous run. pkill exits non-zero when
# no process matched, so the pause is only taken when something was actually
# killed, and the cleanup below runs in either case.
if pkill -9 fdbserver; then
    sleep 5
fi
# ${fdblogdir:?} aborts instead of expanding to "/*" should the variable ever be empty.
rm -rf /fdb-log/data/* /fdb-storage/data/* /var/lib/foundationdb/data/* "${fdblogdir:?}"/*
# Make sure the log directory exists before any process is pointed at it.
mkdir -p "$fdblogdir"
# Write the cluster file; the coordinator is the process listening on port 30000.
mkdir -p /etc/foundationdb/ && echo "8cKn7vrK:83b6hqP1@$public_address:30000" > /etc/foundationdb/fdb.cluster
# Launch stateless-class fdbserver processes for one datacenter.
#
# Arguments:
#   $1  machine_count  - number of distinct --machine-id values to use
#   $2  start_port     - port of the first process; each further process
#                        takes the next consecutive port
#   $3  process_count  - number of processes started per machine id
#   $4  datacenter     - value for --datacenter-id, also used as the suffix
#                        of the machine id (falls back to one_dc when empty)
#
# Reads the globals $public_address and $fdblogdir. Every process is started
# in the background and is not waited for.
# Exits the whole script with status 1 when fewer than four arguments are given.
function scale_out_stateless()
{
    if [ $# -lt 4 ]; then
        echo "usage: scale_out_stateless machine_count start_port process_count datacenter" >&2
        exit 1
    fi

    # Keep all working variables local so repeated calls cannot interfere
    # with each other or with the caller.
    local machine_count=$1
    local start_port=$2
    local stateless_count=$3
    local datacenter=${4:-one_dc}
    local stateless_id=0
    local machine_id i port

    for ((machine_id = 0; machine_id < machine_count; machine_id++)); do
        for ((i = 0; i < stateless_count; i++)); do
            # Ports are consecutive across all machines, not per machine.
            port=$((stateless_id + start_port))
            /usr/sbin/fdbserver \
                --datadir "/var/lib/foundationdb/data/$port" \
                --locality-diskid="diskstateless-$port" \
                --machine-id "machine-$machine_id-$datacenter" \
                --datacenter-id "$datacenter" \
                --public-address "$public_address:$port" \
                --class stateless --listen-address public --cluster-file /etc/foundationdb/fdb.cluster --logdir "$fdblogdir" &
            # Plain assignment: ((x++)) reports failure when x was 0.
            stateless_id=$((stateless_id + 1))
        done
    done
}
# Launch log-class fdbserver processes for one datacenter.
#
# Arguments:
#   $1  machine_count  - number of distinct --machine-id values to use
#   $2  start_port     - port of the first process; each further process
#                        takes the next consecutive port
#   $3  process_count  - number of processes started per machine id
#   $4  datacenter     - value for --datacenter-id, also used as the suffix
#                        of the machine id (falls back to one_dc when empty)
#
# Reads the globals $public_address and $fdblogdir. Every process is started
# in the background; a short pause follows each launch.
# Exits the whole script with status 1 when fewer than four arguments are given.
function scale_out_log()
{
    if [ $# -lt 4 ]; then
        echo "usage: scale_out_log machine_count start_port process_count datacenter" >&2
        exit 1
    fi

    # Keep all working variables local so repeated calls cannot interfere
    # with each other or with the caller.
    local machine_count=$1
    local start_port=$2
    local log_count=$3
    local datacenter=${4:-one_dc}
    local log_id=0
    local machine_id i port

    for ((machine_id = 0; machine_id < machine_count; machine_id++)); do
        for ((i = 0; i < log_count; i++)); do
            # Ports are consecutive across all machines, not per machine.
            port=$((log_id + start_port))
            /usr/sbin/fdbserver \
                --datadir "/fdb-log/data/$port" \
                --locality-diskid="disklog-$port" \
                --machine-id "machine-$machine_id-$datacenter" \
                --datacenter-id "$datacenter" \
                --public-address "$public_address:$port" \
                --class log --listen-address public --cluster-file /etc/foundationdb/fdb.cluster --logdir "$fdblogdir" &
            # Plain assignment: ((x++)) reports failure when x was 0.
            log_id=$((log_id + 1))
            echo "fdbserver log $port started"
            sleep 0.09
        done
    done
}
# Launch storage-class fdbserver processes for one datacenter.
#
# Arguments:
#   $1  machine_count  - number of distinct --machine-id values to use
#   $2  start_port     - port of the first process; each further process
#                        takes the next consecutive port
#   $3  process_count  - number of processes started per machine id
#   $4  datacenter     - value for --datacenter-id, also used as the suffix
#                        of the machine id (falls back to one_dc when empty)
#
# Reads the globals $public_address and $fdblogdir. Every process is started
# in the background; a short pause follows each launch.
# Exits the whole script with status 1 when fewer than four arguments are given.
function scale_out_storage()
{
    if [ $# -lt 4 ]; then
        echo "usage: scale_out_storage machine_count start_port process_count datacenter" >&2
        exit 1
    fi

    # Keep all working variables local so repeated calls cannot interfere
    # with each other or with the caller.
    local machine_count=$1
    local start_port=$2
    local storage_count=$3
    local datacenter=${4:-one_dc}
    local storage_id=0
    local machine_id i port

    for ((machine_id = 0; machine_id < machine_count; machine_id++)); do
        for ((i = 0; i < storage_count; i++)); do
            # Ports are consecutive across all machines, not per machine.
            port=$((storage_id + start_port))
            /usr/sbin/fdbserver \
                --datadir "/fdb-storage/data/$port" \
                --locality-diskid="diskstorage-$port" \
                --machine-id "machine-$machine_id-$datacenter" \
                --datacenter-id "$datacenter" \
                --public-address "$public_address:$port" \
                --class storage --listen-address public --cluster-file /etc/foundationdb/fdb.cluster --logdir "$fdblogdir" &
            # Plain assignment: ((x++)) reports failure when x was 0.
            storage_id=$((storage_id + 1))
            echo "fdbserver storage $port started"
            sleep 0.1
        done
    done
}
# Bring up three datacenters (dc1, dc2, dc3). Each one gets its own block of
# 10000 ports starting at 30000:
#   base + 0     10 stateless processes (10 machine ids x 1)
#   base + 8000  10 log processes       (10 machine ids x 1)
#   base + 9000  400 storage processes  (100 machine ids x 4)
dc_base_port=30000
for dc_name in dc1 dc2 dc3; do
    scale_out_stateless 10 $((dc_base_port)) 1 $dc_name
    scale_out_log 10 $((dc_base_port + 8000)) 1 $dc_name
    scale_out_storage 100 $((dc_base_port + 9000)) 4 $dc_name
    dc_base_port=$((dc_base_port + 10000))
done
# Pause after the launches, then create the database and set its configuration.
sleep 10 && fdbcli --exec 'configure new three_datacenter ssd; configure logs=5; configure proxies=4; coordinators auto'