It runs fine at the start, while it is writing range bytes. But when it starts to write log bytes, the pods in the cluster start to fail. Last logged error: BackupFailed: connection_failed
I’m running on a kubernetes cluster with the fdb kubernetes operator. The cluster is small (~20 GB), has lots of capacity, and is not in use otherwise.
Below is the status json from 2 of the machines that crash, plus some of the early parts of the status json (there was not enough space for the whole file):
Summary
        "20a9f323e5b8a58dbf853745c49e9f8d" : {
            "address" : "10.0.224.234:4501",
            "class_source" : "command_line",
            "class_type" : "stateless",
            "command_line" : "/usr/bin/fdbserver --class=stateless --cluster_file=/var/fdb/data/fdb.cluster --datadir=/var/fdb/data --knob_fetch_keys_parallelism_bytes=100000000 --knob_max_storage_commit_time=3600.0 --knob_relocation_parallelism_per_source_server=20 --knob_storage_server_list_fetch_timeout=60.0 --listen_address=10.240.84.162:4501 --locality_instance_id=stateless-1 --locality_machineid=aks-fdbts1-26081535-vmss0000c2 --locality_zoneid=aks-fdbts1-26081535-vmss0000c2 --logdir=/var/log/fdb-trace-logs --loggroup=fastrestore-cluster --public_address=10.0.224.234:4501 --seed_cluster_file=/var/dynamic-conf/fdb.cluster",
            "cpu" : {
                "usage_cores" : 0.050200700000000001
            },
            "disk" : {
                "busy" : 0.00079999800000000001,
                "free_bytes" : 22384537600,
                "reads" : {
                    "counter" : 16099,
                    "hz" : 0,
                    "sectors" : 0
                },
                "total_bytes" : 41442029568,
                "writes" : {
                    "counter" : 39136,
                    "hz" : 0.20000000000000001,
                    "sectors" : 8
                }
            },
            "excluded" : false,
            "fault_domain" : "aks-fdbts1-26081535-vmss0000c2",
            "locality" : {
                "instance_id" : "stateless-1",
                "machineid" : "aks-fdbts1-26081535-vmss0000c2",
                "processid" : "20a9f323e5b8a58dbf853745c49e9f8d",
                "zoneid" : "aks-fdbts1-26081535-vmss0000c2"
            },
            "machine_id" : "aks-fdbts1-26081535-vmss0000c2",
            "memory" : {
                "available_bytes" : 8589934592,
                "limit_bytes" : 8589934592,
                "unused_allocated_memory" : 61603680,
                "used_bytes" : 2886213632
            },
            "messages" : [
                {
                    "description" : "BackupFailed: connection_failed at Fri Jul  2 13:48:44 2021",
                    "name" : "connection_failed",
                    "raw_log_message" : "\"Severity\"=\"40\", \"Time\"=\"1625233724.192535\", \"DateTime\"=\"2021-07-02T13:48:44Z\", \"Type\"=\"BackupFailed\", \"ID\"=\"9c7d90f273fde00b\", \"Error\"=\"connection_failed\", \"ErrorDescription\"=\"Network connection failed\", \"ErrorCode\"=\"1026\", \"Reason\"=\"Error\", \"Backtrace\"=\"addr2line -e fdbserver.debug -p -C -f -i 0x23592dc 0x2358908 0x2358b71 0x1426691 0x1426e6d 0x1426fa4 0x14156ec 0x1415cce 0x7d17a9 0x142bdbe 0x1452f4e 0x7d17a9 0x8323e2 0x8326d2 0x7d17a9 0x81fa7e 0x7d17a9 0x838fed 0x7d17a9 0x7d17a9 0x19ef6ea 0x7d17a9 0x1a2c58e 0x1a2ce8d 0x85c988 0x1a20355 0x1a214ec 0x85c988 0x870add 0xbcce70 0x2302524 0x77ea72 0x7f4730e99bf7\", \"Machine\"=\"10.0.224.234:4501\", \"LogGroup\"=\"fastrestore-cluster\", \"Roles\"=\"BK,MP,RV\"",
                    "time" : 1625230000.0000002,
                    "type" : "BackupFailed"
                }
            ],
            "network" : {
                "connection_errors" : {
                    "hz" : 0
                },
                "connections_closed" : {
                    "hz" : 0
                },
                "connections_established" : {
                    "hz" : 0
                },
                "current_connections" : 37,
                "megabits_received" : {
                    "hz" : 5.7349199999999998
                },
                "megabits_sent" : {
                    "hz" : 0.62063199999999996
                },
                "tls_policy_failures" : {
                    "hz" : 0
                }
            },
            "roles" : [
                {
                    "commit_latency_statistics" : {
                        "count" : 423,
                        "max" : 0.29223899999999997,
                        "mean" : 0.025854599999999998,
                        "median" : 0.018527499999999999,
                        "min" : 0.0083341600000000002,
                        "p25" : 0.0141029,
                        "p90" : 0.044687699999999997,
                        "p95" : 0.0546033,
                        "p99" : 0.136104,
                        "p99.9" : 0.263129
                    },
                    "grv_latency_statistics" : {
                        "default" : {
                            "count" : 1306,
                            "max" : 2.5611600000000001,
                            "mean" : 0.087367099999999989,
                            "median" : 0.0015635500000000001,
                            "min" : 0.0012454999999999999,
                            "p25" : 0.00136185,
                            "p90" : 0.105823,
                            "p95" : 0.57120000000000004,
                            "p99" : 1.7317300000000002,
                            "p99.9" : 2.4263500000000002
                        }
                    },
                    "id" : "a7c8b60d1591a9a3",
                    "role" : "proxy"
                },
                {
                    "id" : "1106b50133d07b4f",
                    "role" : "resolver"
                }
            ],
            "run_loop_busy" : 0.031430899999999998,
            "uptime_seconds" : 3620.1100000000001,
            "version" : "6.3.12"
        },
        "29b91b2dbf1884d17729b8123106e966" : {
            "address" : "10.0.170.217:4501",
            "class_source" : "command_line",
            "class_type" : "storage",
            "command_line" : "/usr/bin/fdbserver --class=storage --cluster_file=/var/fdb/data/fdb.cluster --datadir=/var/fdb/data --knob_fetch_keys_parallelism_bytes=100000000 --knob_max_storage_commit_time=3600.0 --knob_relocation_parallelism_per_source_server=20 --knob_storage_server_list_fetch_timeout=60.0 --listen_address=10.240.47.10:4501 --locality_instance_id=storage-4 --locality_machineid=aks-fdbts1-26081535-vmss0000cf --locality_zoneid=aks-fdbts1-26081535-vmss0000cf --logdir=/var/log/fdb-trace-logs --loggroup=fastrestore-cluster --public_address=10.0.170.217:4501 --seed_cluster_file=/var/dynamic-conf/fdb.cluster",
            "cpu" : {
                "usage_cores" : 0.25643199999999999
            },
            "disk" : {
                "busy" : 0.58719599999999994,
                "free_bytes" : 1075805536256,
                "reads" : {
                    "counter" : 1229946,
                    "hz" : 351.59800000000001,
                    "sectors" : 14088
                },
                "total_bytes" : 1081180868608,
                "writes" : {
                    "counter" : 4654534,
                    "hz" : 1789.1900000000001,
                    "sectors" : 163440
                }
            },
            "excluded" : false,
            "fault_domain" : "aks-fdbts1-26081535-vmss0000cf",
            "locality" : {
                "instance_id" : "storage-4",
                "machineid" : "aks-fdbts1-26081535-vmss0000cf",
                "processid" : "29b91b2dbf1884d17729b8123106e966",
                "zoneid" : "aks-fdbts1-26081535-vmss0000cf"
            },
            "machine_id" : "aks-fdbts1-26081535-vmss0000cf",
            "memory" : {
                "available_bytes" : 8589934592,
                "limit_bytes" : 8589934592,
                "unused_allocated_memory" : 259375712,
                "used_bytes" : 3879833600
            },
            "messages" : [
                {
                    "description" : "BackupFailed: connection_failed at Fri Jul  2 13:48:44 2021",
                    "name" : "connection_failed",
                    "raw_log_message" : "\"Severity\"=\"40\", \"Time\"=\"1625233724.193909\", \"DateTime\"=\"2021-07-02T13:48:44Z\", \"Type\"=\"BackupFailed\", \"ID\"=\"62dee2851ac53a7d\", \"Error\"=\"connection_failed\", \"ErrorDescription\"=\"Network connection failed\", \"ErrorCode\"=\"1026\", \"Reason\"=\"Error\", \"Backtrace\"=\"addr2line -e fdbserver.debug -p -C -f -i 0x23592dc 0x2358908 0x2358b71 0x1426691 0x1426e6d 0x1426fa4 0x14156ec 0x1415cce 0x7d17a9 0x142bdbe 0x1452f4e 0x7d17a9 0x8323e2 0x8326d2 0x7d17a9 0x81fa7e 0x7d17a9 0x838fed 0x7d17a9 0x7d17a9 0x19ef6ea 0x7d17a9 0x1a2c58e 0x1a2ce8d 0x85c988 0x1a20355 0x1a214ec 0x85c988 0x870add 0xbcce70 0x2302524 0x77ea72 0x7f9086d3fbf7\", \"Machine\"=\"10.0.170.217:4501\", \"LogGroup\"=\"fastrestore-cluster\", \"Roles\"=\"BK,CD,SS\"",
                    "time" : 1625230000.0000002,
                    "type" : "BackupFailed"
                }
            ],
            "network" : {
                "connection_errors" : {
                    "hz" : 0
                },
                "connections_closed" : {
                    "hz" : 0
                },
                "connections_established" : {
                    "hz" : 0.19999899999999998
                },
                "current_connections" : 34,
                "megabits_received" : {
                    "hz" : 4.774
                },
                "megabits_sent" : {
                    "hz" : 2.33378
                },
                "tls_policy_failures" : {
                    "hz" : 0
                }
            },
            "roles" : [
                {
                    "role" : "coordinator"
                },
                {
                    "bytes_queried" : {
                        "counter" : 7338315433,
                        "hz" : 2233840,
                        "roughness" : 910608
                    },
                    "data_lag" : {
                        "seconds" : 0.0024269999999999999,
                        "versions" : 2427
                    },
                    "data_version" : 253594016803,
                    "durability_lag" : {
                        "seconds" : 5.0882199999999997,
                        "versions" : 5088219
                    },
                    "durable_bytes" : {
                        "counter" : 56671581658,
                        "hz" : 5188890,
                        "roughness" : 3585230
                    },
                    "durable_version" : 253588928584,
                    "finished_queries" : {
                        "counter" : 351841,
                        "hz" : 47.999099999999999,
                        "roughness" : 18.566400000000002
                    },
                    "id" : "112213284979716a",
                    "input_bytes" : {
                        "counter" : 56706876536,
                        "hz" : 6481870,
                        "roughness" : 3293690
                    },
                    "keys_queried" : {
                        "counter" : 42281276,
                        "hz" : 20842.599999999999,
                        "roughness" : 8495.3199999999997
                    },
                    "kvstore_available_bytes" : 1077151645696,
                    "kvstore_free_bytes" : 1075805536256,
                    "kvstore_inline_keys" : 0,
                    "kvstore_total_bytes" : 1081180868608,
                    "kvstore_total_nodes" : 0,
                    "kvstore_total_size" : 0,
                    "kvstore_used_bytes" : 5278769352,
                    "local_rate" : 100,
                    "low_priority_queries" : {
                        "counter" : 0,
                        "hz" : 0,
                        "roughness" : -1
                    },
                    "mutation_bytes" : {
                        "counter" : 6652978213,
                        "hz" : 589748,
                        "roughness" : 299673
                    },
                    "mutations" : {
                        "counter" : 56221264,
                        "hz" : 6851.8699999999999,
                        "roughness" : 3480.6999999999998
                    },
                    "query_queue_max" : 15,
                    "read_latency_statistics" : {
                        "count" : 13901,
                        "max" : 0.0829453,
                        "mean" : 0.0020889699999999999,
                        "median" : 1.7643e-05,
                        "min" : 3.5762799999999998e-06,
                        "p25" : 1.1444099999999999e-05,
                        "p90" : 0.0045762099999999998,
                        "p95" : 0.013200799999999999,
                        "p99" : 0.040642299999999999,
                        "p99.9" : 0.065924899999999995
                    },
                    "role" : "storage",
                    "stored_bytes" : 3173224373,
                    "total_queries" : {
                        "counter" : 351841,
                        "hz" : 47.999099999999999,
                        "roughness" : 18.942900000000002
                    }
                }
            ],
            "run_loop_busy" : 0.22185299999999999,
            "uptime_seconds" : 3641.21,
            "version" : "6.3.12"
        },
"configuration" : {
"backup_worker_enabled" : 1,
"coordinators_count" : 5,
"excluded_servers" : [
],
"log_routers" : -1,
"log_spill" : 2,
"logs" : 6,
"proxies" : 3,
"redundancy_mode" : "triple",
"remote_logs" : -1,
"resolvers" : 1,
"storage_engine" : "ssd-2",
"usable_regions" : 1
},
"connection_string" : "fastrestore_cluster:JDuIVunMOmgDbs77FJKxPd3ogvMLgVzy@10.0.16.27:4501,10.0.18.250:4501,10.0.170.217:4501,10.0.179.174:4501,10.0.217.29:4501",
"data" : {
"average_partition_size_bytes" : 13144000,
"least_operating_space_bytes_log_server" : 1020515968631,
"least_operating_space_bytes_storage_server" : 1021157111823,
"moving_data" : {
"highest_priority" : 950,
"in_flight_bytes" : 736064000,
"in_queue_bytes" : 0,
"total_written_bytes" : 1552298318
},
"partitions_count" : 1580,
"state" : {
"description" : "Repartitioning",
"healthy" : true,
"min_replicas_remaining" : 3,
"name" : "healthy_repartitioning"
},
"system_kv_size_bytes" : 5321304694,
"team_trackers" : [
{
"in_flight_bytes" : 1569019680,
"primary" : true,
"state" : {
"description" : "Repartitioning",
"healthy" : true,
"min_replicas_remaining" : 3,
"name" : "healthy_repartitioning"
},
"unhealthy_servers" : 0
}
],
"total_disk_used_bytes" : 108928196752,
"total_kv_size_bytes" : 20045924400
},
"database_available" : true,
"database_lock_state" : {
"lock_uid" : "9ee1e6de1e4cd4cc2712364c14708492",
"locked" : true
},
"datacenter_lag" : {
"seconds" : 0,
"versions" : 0
},
"degraded_processes" : 0,
"fault_tolerance" : {
"max_zone_failures_without_losing_availability" : 2,
"max_zone_failures_without_losing_data" : 2
},
"full_replication" : true,
"generation" : 10,
"incompatible_connections" : [
],
"latency_probe" : {
"batch_priority_transaction_start_seconds" : 1.3482499999999999,
"commit_seconds" : 0.020109200000000001,
"immediate_priority_transaction_start_seconds" : 0.00327897,
"read_seconds" : 0.00146174,
"transaction_start_seconds" : 0.31443399999999999
},
“layers” : {
“_valid” : true,
“backup” : {
“blob_recent_io” : {
“bytes_per_second” : 512838.17745126353,
“bytes_sent” : 19771741,
“requests_failed” : 0,
“requests_successful” : 6
},
“instances” : {
“02c71d821d2b83c8dd5dab60154de298” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2255943111,
“requests_failed” : 40,
“requests_successful” : 14462
}
},
“configured_workers” : 10,
“id” : “02c71d821d2b83c8dd5dab60154de298”,
“last_updated” : 1625234200.3824546,
“main_thread_cpu_seconds” : 213.70202500000002,
“memory_usage” : 306892800,
“process_cpu_seconds” : 214.962558,
“resident_size” : 54292480,
“version” : “6.3.12”
},
“2a5d58750a1b0203d8fe7212dc810396” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2203510753,
“requests_failed” : 60,
“requests_successful” : 18140
}
},
“configured_workers” : 10,
“id” : “2a5d58750a1b0203d8fe7212dc810396”,
“last_updated” : 1625234205.2863677,
“main_thread_cpu_seconds” : 248.40060199999999,
“memory_usage” : 312696832,
“process_cpu_seconds” : 249.684021,
“resident_size” : 62464000,
“version” : “6.3.12”
},
“4e0268b78aabbdeb155f3e14389ca0cb” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2128135621,
“requests_failed” : 0,
“requests_successful” : 746
}
},
“configured_workers” : 10,
“id” : “4e0268b78aabbdeb155f3e14389ca0cb”,
“last_updated” : 1625234201.8532672,
“main_thread_cpu_seconds” : 43.015414999999997,
“memory_usage” : 365826048,
“process_cpu_seconds” : 43.422720999999996,
“resident_size” : 115535872,
“version” : “6.3.12”
},
“6b9dadef3e45f5f83e6e9167f5f9f15e” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 68505442,
“requests_failed” : 0,
“requests_successful” : 27
}
},
“configured_workers” : 10,
“id” : “6b9dadef3e45f5f83e6e9167f5f9f15e”,
“last_updated” : 1625234199.5132859,
“main_thread_cpu_seconds” : 5.3838940000000006,
“memory_usage” : 272863232,
“process_cpu_seconds” : 5.6143000000000001,
“resident_size” : 22249472,
“version” : “6.3.12”
},
“c80fde6c1f2785a9333723a996924715” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2083582043,
“requests_failed” : 0,
“requests_successful” : 1552
}
},
“configured_workers” : 10,
“id” : “c80fde6c1f2785a9333723a996924715”,
“last_updated” : 1625234217.1097019,
“main_thread_cpu_seconds” : 59.917552999999998,
“memory_usage” : 300843008,
“process_cpu_seconds” : 60.457518999999998,
“resident_size” : 50429952,
“version” : “6.3.12”
},
“e70819e5a504aca673cdb4d41f6a6195” : {
“blob_stats” : {
“recent” : {
0
},
“total” : {
“bytes_sent” : 2096429217,
“requests_failed” : 0,
“requests_successful” : 5754
}
},
“configured_workers” : 10,
“id” : “e70819e5a504aca673cdb4d41f6a6195”,
“last_updated” : 1625234225.4263587,
“main_thread_cpu_seconds” : 100.567784,
“memory_usage” : 298971136,
“process_cpu_seconds” : 101.30439,
“resident_size” : 48992256,
“version” : “6.3.12”
},
“e79bf28ed40107bb3aacfe10f3cb0f51” : {
“blob_stats” : {
“recent” : {
“bytes_per_second” : 512838.17745126353,
“bytes_sent” : 19771741,
“requests_failed” : 0,
“requests_successful” : 6
},
“total” : {
“bytes_sent” : 2199469262,
“requests_failed” : 0,
“requests_successful” : 6703
}
},
“configured_workers” : 10,
“id” : “e79bf28ed40107bb3aacfe10f3cb0f51”,
“last_updated” : 1625234225.1540778,
“main_thread_cpu_seconds” : 117.33948100000001,
“memory_usage” : 307281920,
“process_cpu_seconds” : 118.24613199999999,
“resident_size” : 58974208,
“version” : “6.3.12”
}
},
“instances_running” : 7,
“last_updated” : 1625234225.4263587,
“paused” : false,
"tags" : {
"fastRestore" : {
"current_container" : "blobstore://minio:public-minio-password@minio-gateway.timeseries.svc:9000/automatic_backups/fastrestore2021-07-02?bucket=foundationdb-backup-container&sc=0",
"current_status" : "has been started",
"last_restorable_seconds_behind" : 253592.236274,
"last_restorable_version" : 0,
"mutation_log_bytes_written" : 0,
"mutation_stream_id" : "50f8e0c8e3bf64a2cbebb871f9c9d5a6",
"range_bytes_written" : 13017977287,
"running_backup" : true,
"running_backup_is_restorable" : false
}
},
“total_workers” : 70
}
},