It runs fine at the start, while it is writing range bytes. But when it starts to write log bytes, the pods in the cluster start to fail. Last logged error: BackupFailed: connection_failed
I’m running on a Kubernetes cluster with the FDB Kubernetes operator. The cluster is small (~20 GB), has lots of capacity, and is not otherwise in use.
Status JSON from 2 of the machines that crash, plus some of the early parts of the status JSON (there was not enough space for the whole file):
Summary
"20a9f323e5b8a58dbf853745c49e9f8d" : {
"address" : "10.0.224.234:4501",
"class_source" : "command_line",
"class_type" : "stateless",
"command_line" : "/usr/bin/fdbserver --class=stateless --cluster_file=/var/fdb/data/fdb.cluster --datadir=/var/fdb/data --knob_fetch_keys_parallelism_bytes=100000000 --knob_max_storage_commit_time=3600.0 --knob_relocation_parallelism_per_source_server=20 --knob_storage_server_list_fetch_timeout=60.0 --listen_address=10.240.84.162:4501 --locality_instance_id=stateless-1 --locality_machineid=aks-fdbts1-26081535-vmss0000c2 --locality_zoneid=aks-fdbts1-26081535-vmss0000c2 --logdir=/var/log/fdb-trace-logs --loggroup=fastrestore-cluster --public_address=10.0.224.234:4501 --seed_cluster_file=/var/dynamic-conf/fdb.cluster",
"cpu" : {
"usage_cores" : 0.050200700000000001
},
"disk" : {
"busy" : 0.00079999800000000001,
"free_bytes" : 22384537600,
"reads" : {
"counter" : 16099,
"hz" : 0,
"sectors" : 0
},
"total_bytes" : 41442029568,
"writes" : {
"counter" : 39136,
"hz" : 0.20000000000000001,
"sectors" : 8
}
},
"excluded" : false,
"fault_domain" : "aks-fdbts1-26081535-vmss0000c2",
"locality" : {
"instance_id" : "stateless-1",
"machineid" : "aks-fdbts1-26081535-vmss0000c2",
"processid" : "20a9f323e5b8a58dbf853745c49e9f8d",
"zoneid" : "aks-fdbts1-26081535-vmss0000c2"
},
"machine_id" : "aks-fdbts1-26081535-vmss0000c2",
"memory" : {
"available_bytes" : 8589934592,
"limit_bytes" : 8589934592,
"unused_allocated_memory" : 61603680,
"used_bytes" : 2886213632
},
"messages" : [
{
"description" : "BackupFailed: connection_failed at Fri Jul 2 13:48:44 2021",
"name" : "connection_failed",
"raw_log_message" : "\"Severity\"=\"40\", \"Time\"=\"1625233724.192535\", \"DateTime\"=\"2021-07-02T13:48:44Z\", \"Type\"=\"BackupFailed\", \"ID\"=\"9c7d90f273fde00b\", \"Error\"=\"connection_failed\", \"ErrorDescription\"=\"Network connection failed\", \"ErrorCode\"=\"1026\", \"Reason\"=\"Error\", \"Backtrace\"=\"addr2line -e fdbserver.debug -p -C -f -i 0x23592dc 0x2358908 0x2358b71 0x1426691 0x1426e6d 0x1426fa4 0x14156ec 0x1415cce 0x7d17a9 0x142bdbe 0x1452f4e 0x7d17a9 0x8323e2 0x8326d2 0x7d17a9 0x81fa7e 0x7d17a9 0x838fed 0x7d17a9 0x7d17a9 0x19ef6ea 0x7d17a9 0x1a2c58e 0x1a2ce8d 0x85c988 0x1a20355 0x1a214ec 0x85c988 0x870add 0xbcce70 0x2302524 0x77ea72 0x7f4730e99bf7\", \"Machine\"=\"10.0.224.234:4501\", \"LogGroup\"=\"fastrestore-cluster\", \"Roles\"=\"BK,MP,RV\"",
"time" : 1625230000.0000002,
"type" : "BackupFailed"
}
],
"network" : {
"connection_errors" : {
"hz" : 0
},
"connections_closed" : {
"hz" : 0
},
"connections_established" : {
"hz" : 0
},
"current_connections" : 37,
"megabits_received" : {
"hz" : 5.7349199999999998
},
"megabits_sent" : {
"hz" : 0.62063199999999996
},
"tls_policy_failures" : {
"hz" : 0
}
},
"roles" : [
{
"commit_latency_statistics" : {
"count" : 423,
"max" : 0.29223899999999997,
"mean" : 0.025854599999999998,
"median" : 0.018527499999999999,
"min" : 0.0083341600000000002,
"p25" : 0.0141029,
"p90" : 0.044687699999999997,
"p95" : 0.0546033,
"p99" : 0.136104,
"p99.9" : 0.263129
},
"grv_latency_statistics" : {
"default" : {
"count" : 1306,
"max" : 2.5611600000000001,
"mean" : 0.087367099999999989,
"median" : 0.0015635500000000001,
"min" : 0.0012454999999999999,
"p25" : 0.00136185,
"p90" : 0.105823,
"p95" : 0.57120000000000004,
"p99" : 1.7317300000000002,
"p99.9" : 2.4263500000000002
}
},
"id" : "a7c8b60d1591a9a3",
"role" : "proxy"
},
{
"id" : "1106b50133d07b4f",
"role" : "resolver"
}
],
"run_loop_busy" : 0.031430899999999998,
"uptime_seconds" : 3620.1100000000001,
"version" : "6.3.12"
},
"29b91b2dbf1884d17729b8123106e966" : {
"address" : "10.0.170.217:4501",
"class_source" : "command_line",
"class_type" : "storage",
"command_line" : "/usr/bin/fdbserver --class=storage --cluster_file=/var/fdb/data/fdb.cluster --datadir=/var/fdb/data --knob_fetch_keys_parallelism_bytes=100000000 --knob_max_storage_commit_time=3600.0 --knob_relocation_parallelism_per_source_server=20 --knob_storage_server_list_fetch_timeout=60.0 --listen_address=10.240.47.10:4501 --locality_instance_id=storage-4 --locality_machineid=aks-fdbts1-26081535-vmss0000cf --locality_zoneid=aks-fdbts1-26081535-vmss0000cf --logdir=/var/log/fdb-trace-logs --loggroup=fastrestore-cluster --public_address=10.0.170.217:4501 --seed_cluster_file=/var/dynamic-conf/fdb.cluster",
"cpu" : {
"usage_cores" : 0.25643199999999999
},
"disk" : {
"busy" : 0.58719599999999994,
"free_bytes" : 1075805536256,
"reads" : {
"counter" : 1229946,
"hz" : 351.59800000000001,
"sectors" : 14088
},
"total_bytes" : 1081180868608,
"writes" : {
"counter" : 4654534,
"hz" : 1789.1900000000001,
"sectors" : 163440
}
},
"excluded" : false,
"fault_domain" : "aks-fdbts1-26081535-vmss0000cf",
"locality" : {
"instance_id" : "storage-4",
"machineid" : "aks-fdbts1-26081535-vmss0000cf",
"processid" : "29b91b2dbf1884d17729b8123106e966",
"zoneid" : "aks-fdbts1-26081535-vmss0000cf"
},
"machine_id" : "aks-fdbts1-26081535-vmss0000cf",
"memory" : {
"available_bytes" : 8589934592,
"limit_bytes" : 8589934592,
"unused_allocated_memory" : 259375712,
"used_bytes" : 3879833600
},
"messages" : [
{
"description" : "BackupFailed: connection_failed at Fri Jul 2 13:48:44 2021",
"name" : "connection_failed",
"raw_log_message" : "\"Severity\"=\"40\", \"Time\"=\"1625233724.193909\", \"DateTime\"=\"2021-07-02T13:48:44Z\", \"Type\"=\"BackupFailed\", \"ID\"=\"62dee2851ac53a7d\", \"Error\"=\"connection_failed\", \"ErrorDescription\"=\"Network connection failed\", \"ErrorCode\"=\"1026\", \"Reason\"=\"Error\", \"Backtrace\"=\"addr2line -e fdbserver.debug -p -C -f -i 0x23592dc 0x2358908 0x2358b71 0x1426691 0x1426e6d 0x1426fa4 0x14156ec 0x1415cce 0x7d17a9 0x142bdbe 0x1452f4e 0x7d17a9 0x8323e2 0x8326d2 0x7d17a9 0x81fa7e 0x7d17a9 0x838fed 0x7d17a9 0x7d17a9 0x19ef6ea 0x7d17a9 0x1a2c58e 0x1a2ce8d 0x85c988 0x1a20355 0x1a214ec 0x85c988 0x870add 0xbcce70 0x2302524 0x77ea72 0x7f9086d3fbf7\", \"Machine\"=\"10.0.170.217:4501\", \"LogGroup\"=\"fastrestore-cluster\", \"Roles\"=\"BK,CD,SS\"",
"time" : 1625230000.0000002,
"type" : "BackupFailed"
}
],
"network" : {
"connection_errors" : {
"hz" : 0
},
"connections_closed" : {
"hz" : 0
},
"connections_established" : {
"hz" : 0.19999899999999998
},
"current_connections" : 34,
"megabits_received" : {
"hz" : 4.774
},
"megabits_sent" : {
"hz" : 2.33378
},
"tls_policy_failures" : {
"hz" : 0
}
},
"roles" : [
{
"role" : "coordinator"
},
{
"bytes_queried" : {
"counter" : 7338315433,
"hz" : 2233840,
"roughness" : 910608
},
"data_lag" : {
"seconds" : 0.0024269999999999999,
"versions" : 2427
},
"data_version" : 253594016803,
"durability_lag" : {
"seconds" : 5.0882199999999997,
"versions" : 5088219
},
"durable_bytes" : {
"counter" : 56671581658,
"hz" : 5188890,
"roughness" : 3585230
},
"durable_version" : 253588928584,
"finished_queries" : {
"counter" : 351841,
"hz" : 47.999099999999999,
"roughness" : 18.566400000000002
},
"id" : "112213284979716a",
"input_bytes" : {
"counter" : 56706876536,
"hz" : 6481870,
"roughness" : 3293690
},
"keys_queried" : {
"counter" : 42281276,
"hz" : 20842.599999999999,
"roughness" : 8495.3199999999997
},
"kvstore_available_bytes" : 1077151645696,
"kvstore_free_bytes" : 1075805536256,
"kvstore_inline_keys" : 0,
"kvstore_total_bytes" : 1081180868608,
"kvstore_total_nodes" : 0,
"kvstore_total_size" : 0,
"kvstore_used_bytes" : 5278769352,
"local_rate" : 100,
"low_priority_queries" : {
"counter" : 0,
"hz" : 0,
"roughness" : -1
},
"mutation_bytes" : {
"counter" : 6652978213,
"hz" : 589748,
"roughness" : 299673
},
"mutations" : {
"counter" : 56221264,
"hz" : 6851.8699999999999,
"roughness" : 3480.6999999999998
},
"query_queue_max" : 15,
"read_latency_statistics" : {
"count" : 13901,
"max" : 0.0829453,
"mean" : 0.0020889699999999999,
"median" : 1.7643e-05,
"min" : 3.5762799999999998e-06,
"p25" : 1.1444099999999999e-05,
"p90" : 0.0045762099999999998,
"p95" : 0.013200799999999999,
"p99" : 0.040642299999999999,
"p99.9" : 0.065924899999999995
},
"role" : "storage",
"stored_bytes" : 3173224373,
"total_queries" : {
"counter" : 351841,
"hz" : 47.999099999999999,
"roughness" : 18.942900000000002
}
}
],
"run_loop_busy" : 0.22185299999999999,
"uptime_seconds" : 3641.21,
"version" : "6.3.12"
},
"configuration" : {
"backup_worker_enabled" : 1,
"coordinators_count" : 5,
"excluded_servers" : [
],
"log_routers" : -1,
"log_spill" : 2,
"logs" : 6,
"proxies" : 3,
"redundancy_mode" : "triple",
"remote_logs" : -1,
"resolvers" : 1,
"storage_engine" : "ssd-2",
"usable_regions" : 1
},
"connection_string" : "fastrestore_cluster:JDuIVunMOmgDbs77FJKxPd3ogvMLgVzy@10.0.16.27:4501,10.0.18.250:4501,10.0.170.217:4501,10.0.179.174:4501,10.0.217.29:4501",
"data" : {
"average_partition_size_bytes" : 13144000,
"least_operating_space_bytes_log_server" : 1020515968631,
"least_operating_space_bytes_storage_server" : 1021157111823,
"moving_data" : {
"highest_priority" : 950,
"in_flight_bytes" : 736064000,
"in_queue_bytes" : 0,
"total_written_bytes" : 1552298318
},
"partitions_count" : 1580,
"state" : {
"description" : "Repartitioning",
"healthy" : true,
"min_replicas_remaining" : 3,
"name" : "healthy_repartitioning"
},
"system_kv_size_bytes" : 5321304694,
"team_trackers" : [
{
"in_flight_bytes" : 1569019680,
"primary" : true,
"state" : {
"description" : "Repartitioning",
"healthy" : true,
"min_replicas_remaining" : 3,
"name" : "healthy_repartitioning"
},
"unhealthy_servers" : 0
}
],
"total_disk_used_bytes" : 108928196752,
"total_kv_size_bytes" : 20045924400
},
"database_available" : true,
"database_lock_state" : {
"lock_uid" : "9ee1e6de1e4cd4cc2712364c14708492",
"locked" : true
},
"datacenter_lag" : {
"seconds" : 0,
"versions" : 0
},
"degraded_processes" : 0,
"fault_tolerance" : {
"max_zone_failures_without_losing_availability" : 2,
"max_zone_failures_without_losing_data" : 2
},
"full_replication" : true,
"generation" : 10,
"incompatible_connections" : [
],
"latency_probe" : {
"batch_priority_transaction_start_seconds" : 1.3482499999999999,
"commit_seconds" : 0.020109200000000001,
"immediate_priority_transaction_start_seconds" : 0.00327897,
"read_seconds" : 0.00146174,
"transaction_start_seconds" : 0.31443399999999999
},
“layers” : {
“_valid” : true,
“backup” : {
“blob_recent_io” : {
“bytes_per_second” : 512838.17745126353,
“bytes_sent” : 19771741,
“requests_failed” : 0,
“requests_successful” : 6
},
“instances” : {
“02c71d821d2b83c8dd5dab60154de298” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2255943111,
“requests_failed” : 40,
“requests_successful” : 14462
}
},
“configured_workers” : 10,
“id” : “02c71d821d2b83c8dd5dab60154de298”,
“last_updated” : 1625234200.3824546,
“main_thread_cpu_seconds” : 213.70202500000002,
“memory_usage” : 306892800,
“process_cpu_seconds” : 214.962558,
“resident_size” : 54292480,
“version” : “6.3.12”
},
“2a5d58750a1b0203d8fe7212dc810396” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2203510753,
“requests_failed” : 60,
“requests_successful” : 18140
}
},
“configured_workers” : 10,
“id” : “2a5d58750a1b0203d8fe7212dc810396”,
“last_updated” : 1625234205.2863677,
“main_thread_cpu_seconds” : 248.40060199999999,
“memory_usage” : 312696832,
“process_cpu_seconds” : 249.684021,
“resident_size” : 62464000,
“version” : “6.3.12”
},
“4e0268b78aabbdeb155f3e14389ca0cb” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2128135621,
“requests_failed” : 0,
“requests_successful” : 746
}
},
“configured_workers” : 10,
“id” : “4e0268b78aabbdeb155f3e14389ca0cb”,
“last_updated” : 1625234201.8532672,
“main_thread_cpu_seconds” : 43.015414999999997,
“memory_usage” : 365826048,
“process_cpu_seconds” : 43.422720999999996,
“resident_size” : 115535872,
“version” : “6.3.12”
},
“6b9dadef3e45f5f83e6e9167f5f9f15e” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 68505442,
“requests_failed” : 0,
“requests_successful” : 27
}
},
“configured_workers” : 10,
“id” : “6b9dadef3e45f5f83e6e9167f5f9f15e”,
“last_updated” : 1625234199.5132859,
“main_thread_cpu_seconds” : 5.3838940000000006,
“memory_usage” : 272863232,
“process_cpu_seconds” : 5.6143000000000001,
“resident_size” : 22249472,
“version” : “6.3.12”
},
“c80fde6c1f2785a9333723a996924715” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2083582043,
“requests_failed” : 0,
“requests_successful” : 1552
}
},
“configured_workers” : 10,
“id” : “c80fde6c1f2785a9333723a996924715”,
“last_updated” : 1625234217.1097019,
“main_thread_cpu_seconds” : 59.917552999999998,
“memory_usage” : 300843008,
“process_cpu_seconds” : 60.457518999999998,
“resident_size” : 50429952,
“version” : “6.3.12”
},
“e70819e5a504aca673cdb4d41f6a6195” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2096429217,
“requests_failed” : 0,
“requests_successful” : 5754
}
},
“configured_workers” : 10,
“id” : “e70819e5a504aca673cdb4d41f6a6195”,
“last_updated” : 1625234225.4263587,
“main_thread_cpu_seconds” : 100.567784,
“memory_usage” : 298971136,
“process_cpu_seconds” : 101.30439,
“resident_size” : 48992256,
“version” : “6.3.12”
},
“e79bf28ed40107bb3aacfe10f3cb0f51” : {
“blob_stats” : {
“recent” : {
“bytes_per_second” : 512838.17745126353,
“bytes_sent” : 19771741,
“requests_failed” : 0,
“requests_successful” : 6
},
“total” : {
“bytes_sent” : 2199469262,
“requests_failed” : 0,
“requests_successful” : 6703
}
},
“configured_workers” : 10,
“id” : “e79bf28ed40107bb3aacfe10f3cb0f51”,
“last_updated” : 1625234225.1540778,
“main_thread_cpu_seconds” : 117.33948100000001,
“memory_usage” : 307281920,
“process_cpu_seconds” : 118.24613199999999,
“resident_size” : 58974208,
“version” : “6.3.12”
}
},
"instances_running" : 7,
"last_updated" : 1625234225.4263587,
"paused" : false,
"tags" : {
"fastRestore" : {
"current_container" : "blobstore://minio:public-minio-password@minio-gateway.timeseries.svc:9000/automatic_backups/fastrestore2021-07-02?bucket=foundationdb-backup-container&sc=0",
"current_status" : "has been started",
"last_restorable_seconds_behind" : 253592.236274,
"last_restorable_version" : 0,
"mutation_log_bytes_written" : 0,
"mutation_stream_id" : "50f8e0c8e3bf64a2cbebb871f9c9d5a6",
"range_bytes_written" : 13017977287,
"running_backup" : true,
"running_backup_is_restorable" : false
}
},
"total_workers" : 70
}
},