
Conversation

maflcko
Member

@maflcko maflcko commented Sep 20, 2022

Also wait for the other node to notice the closed socket. Otherwise, the other node cannot reliably use the connect helper afterwards.

Fixes #26014
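
For context, here is the shape of the merged change, reconstructed from the context lines of the diffs quoted below (a sketch, not the verbatim commit): disconnect_nodes now waits until both nodes have dropped the peer, not only the node that issued the disconnect.

```python
# Sketch of the merged disconnect_nodes helper, reconstructed from the
# diffs quoted later in this thread; excerpted from a method, hence `self`.
def disconnect_nodes_helper(node_a, node_b):
    def get_peer_ids(from_connection, node_num):
        # Peers are identified by the "testnodeN" tag in their subver string.
        return [peer['id'] for peer in from_connection.getpeerinfo()
                if "testnode{}".format(node_num) in peer['subver']]

    for peer_id in get_peer_ids(node_a, node_b.index):
        node_a.disconnectnode(nodeid=peer_id)

    # The fix: wait until *both* sides have noticed the closed socket.
    self.wait_until(lambda: not get_peer_ids(node_a, node_b.index), timeout=5)
    self.wait_until(lambda: not get_peer_ids(node_b, node_a.index), timeout=5)
```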

@maflcko maflcko added this to the 24.0 milestone Sep 20, 2022
@fanquake fanquake added the Tests label Sep 20, 2022
Contributor

@stickies-v stickies-v left a comment


Makes sense to verify the disconnect has been picked up by both nodes, so this seems like a good improvement to me.

In connect_nodes(), relying on the number of connected/veracked peers instead of on the actual peer seems like a very fragile approach, though; I would expect it to break again in the future. Don't we have better ways to check that 2 nodes are connected?

For example, building on faeea28, I think something like this (an MVP, open to improvements) would work:

git diff
```diff
diff --git a/test/functional/test_framework/test_framework.py b/test/functional/test_framework/test_framework.py
index b1164b98f..f79b931b6 100755
--- a/test/functional/test_framework/test_framework.py
+++ b/test/functional/test_framework/test_framework.py
@@ -581,8 +581,6 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass):
     def connect_nodes(self, a, b):
         from_connection = self.nodes[a]
         to_connection = self.nodes[b]
-        from_num_peers = 1 + len(from_connection.getpeerinfo())
-        to_num_peers = 1 + len(to_connection.getpeerinfo())
         ip_port = "127.0.0.1:" + str(p2p_port(b))
         from_connection.addnode(ip_port, "onetry")
         # poll until version handshake complete to avoid race conditions
@@ -590,30 +588,27 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass):
         # See comments in net_processing:
         # * Must have a version message before anything else
         # * Must have a verack message before anything else
-        self.wait_until(lambda: sum(peer['version'] != 0 for peer in from_connection.getpeerinfo()) == from_num_peers)
-        self.wait_until(lambda: sum(peer['version'] != 0 for peer in to_connection.getpeerinfo()) == to_num_peers)
-        self.wait_until(lambda: sum(peer['bytesrecv_per_msg'].pop('verack', 0) == 24 for peer in from_connection.getpeerinfo()) == from_num_peers)
-        self.wait_until(lambda: sum(peer['bytesrecv_per_msg'].pop('verack', 0) == 24 for peer in to_connection.getpeerinfo()) == to_num_peers)
+        def is_connection_established(from_node: TestNode, to_node: TestNode) -> bool:
+            connected = from_node.get_connected_peers_info(to_node.index)
+            veracked = [peer for peer in connected if peer['bytesrecv_per_msg'].pop('verack', 0) == 24]
+            return len(veracked) > 0
+
+        self.wait_until(lambda: is_connection_established(from_connection, to_connection))
+        self.wait_until(lambda: is_connection_established(to_connection, from_connection))
+
 
     def disconnect_nodes(self, a, b):
         def disconnect_nodes_helper(node_a, node_b):
-            def get_peer_ids(from_connection, node_num):
-                result = []
-                for peer in from_connection.getpeerinfo():
-                    if "testnode{}".format(node_num) in peer['subver']:
-                        result.append(peer['id'])
-                return result
-
-            peer_ids = get_peer_ids(node_a, node_b.index)
-            if not peer_ids:
+            peers = node_a.get_connected_peers_info(node_b.index)
+            if not peers:
                 self.log.warning("disconnect_nodes: {} and {} were not connected".format(
                     node_a.index,
                     node_b.index,
                 ))
                 return
-            for peer_id in peer_ids:
+            for peer in peers:
                 try:
-                    node_a.disconnectnode(nodeid=peer_id)
+                    node_a.disconnectnode(nodeid=peer["id"])
                 except JSONRPCException as e:
                     # If this node is disconnected between calculating the peer id
                     # and issuing the disconnect, don't worry about it.
@@ -622,8 +617,8 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass):
                         raise
 
             # wait to disconnect
-            self.wait_until(lambda: not get_peer_ids(node_a, node_b.index), timeout=5)
-            self.wait_until(lambda: not get_peer_ids(node_b, node_a.index), timeout=5)
+            self.wait_until(lambda: not node_a.get_connected_peers_info(node_b.index), timeout=5)
+            self.wait_until(lambda: not node_b.get_connected_peers_info(node_a.index), timeout=5)
 
         disconnect_nodes_helper(self.nodes[a], self.nodes[b])
 
diff --git a/test/functional/test_framework/test_node.py b/test/functional/test_framework/test_node.py
index e35cae006..73ad492a5 100755
--- a/test/functional/test_framework/test_node.py
+++ b/test/functional/test_framework/test_node.py
@@ -21,6 +21,7 @@ import collections
 import shlex
 import sys
 from pathlib import Path
+from typing import Any, Dict, List
 
 from .authproxy import JSONRPCException
 from .descriptors import descsum_create
@@ -645,6 +646,13 @@ class TestNode():
 
         return p2p_conn
 
+    def get_connected_peers_info(self, other_node_index: int) -> List[Dict[str, Any]]:
+        result = []
+        for peer in self.getpeerinfo():
+            if "testnode{}".format(other_node_index) in peer['subver']:
+                result.append(peer)
+        return result
+
     def num_test_p2p_connections(self):
         """Return number of test framework p2p connections to the node."""
         return len([peer for peer in self.getpeerinfo() if peer['subver'] == P2P_SUBVERSION])
```

@maflcko
Member Author

maflcko commented Sep 21, 2022

get_connected_peers_info would only work with Bitcoin Core peers, not mininode peers, so I'd prefer not to make it public
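
To illustrate (the subver strings below are paraphrased, not exact): framework P2P connections ("mininodes") advertise the fixed P2P_SUBVERSION user agent, which carries no testnodeN tag, so matching on subver silently ignores them.

```python
# Illustration only; subver values are paraphrased, not exact.
peers = [
    {'id': 0, 'subver': '/Satoshi:24.0.0(testnode1)/'},  # another bitcoind TestNode
    {'id': 1, 'subver': '/python-p2p-tester:0.0.3/'},    # a mininode (P2PInterface)
]
# The "testnodeN" match finds the TestNode peer but is blind to the mininode:
assert [p['id'] for p in peers if 'testnode1' in p['subver']] == [0]
```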

@maflcko
Member Author

maflcko commented Sep 21, 2022

> I would expect it to break again in the future.

The only way I can see it "breaking" is when you attempt to connect to the same peer twice, in which case the failure is probably wanted. I am not sure whether your approach silently dismisses that failure.
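
A hypothetical sketch of that failure mode (all numbers invented for illustration): the count-based wait times out loudly when a duplicate connection appears, whereas an "at least one veracked peer" check passes silently.

```python
# Hypothetical numbers, for illustration only; not framework code.
peers_before = 1                 # peers node b had before connect_nodes()
expected = peers_before + 1      # the count-based wait expects exactly 2

actual = 3                       # a buggy test connected a -> b twice
assert actual != expected        # the predicate never holds -> loud timeout

# By contrast, a check like len(veracked) > 0 is satisfied by any connection
# and would let the duplicate slip through silently.
```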

@stickies-v
Contributor

> get_connected_peers_info would only work with Bitcoin Core peers, not mininode peers, so I'd prefer not to make it public

Good point, I did not consider that. I've updated the diff (below) to keep it private, which also reduces the diff quite a bit. (Note: get_peer_ids() would benefit from the same list comprehension imo, but that's not a required change for this to work.)

> The only way I can see it "breaking" is when you attempt to connect to the same peer twice, in which case the failure is probably wanted. I am not sure whether your approach silently dismisses that failure.

Updated to return True only if we have exactly 1 connection to b, which I think addresses your concern. I don't have any specific ideas of how it would break otherwise, but since connect_nodes() doesn't own the nodes, I think it's not unreasonable to assume that a node may update its peers in the background. If, without additional complexity, we can check that a is now connected to b and not just to any new node, that seems like a strict improvement to me.

git diff
```diff
diff --git a/test/functional/test_framework/test_framework.py b/test/functional/test_framework/test_framework.py
index b1164b98f..fa3ddebe3 100755
--- a/test/functional/test_framework/test_framework.py
+++ b/test/functional/test_framework/test_framework.py
@@ -581,8 +581,6 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass):
     def connect_nodes(self, a, b):
         from_connection = self.nodes[a]
         to_connection = self.nodes[b]
-        from_num_peers = 1 + len(from_connection.getpeerinfo())
-        to_num_peers = 1 + len(to_connection.getpeerinfo())
         ip_port = "127.0.0.1:" + str(p2p_port(b))
         from_connection.addnode(ip_port, "onetry")
         # poll until version handshake complete to avoid race conditions
@@ -590,10 +588,14 @@ class BitcoinTestFramework(metaclass=BitcoinTestMetaClass):
         # See comments in net_processing:
         # * Must have a version message before anything else
         # * Must have a verack message before anything else
-        self.wait_until(lambda: sum(peer['version'] != 0 for peer in from_connection.getpeerinfo()) == from_num_peers)
-        self.wait_until(lambda: sum(peer['version'] != 0 for peer in to_connection.getpeerinfo()) == to_num_peers)
-        self.wait_until(lambda: sum(peer['bytesrecv_per_msg'].pop('verack', 0) == 24 for peer in from_connection.getpeerinfo()) == from_num_peers)
-        self.wait_until(lambda: sum(peer['bytesrecv_per_msg'].pop('verack', 0) == 24 for peer in to_connection.getpeerinfo()) == to_num_peers)
+        def is_connection_established(from_node: TestNode, to_node: TestNode) -> bool:
+            connected = [peer for peer in from_node.getpeerinfo() if f"testnode{to_node.index}" in peer["subver"]]
+            veracked = [peer for peer in connected if peer["bytesrecv_per_msg"].pop('verack', 0) == 24]
+            return len(veracked) == 1
+
+        self.wait_until(lambda: is_connection_established(from_connection, to_connection))
+        self.wait_until(lambda: is_connection_established(to_connection, from_connection))
+
 
     def disconnect_nodes(self, a, b):
         def disconnect_nodes_helper(node_a, node_b):
```

Contributor

@stickies-v stickies-v left a comment


ACK faeea28

I would prefer making connect_nodes() more robust too, as per my suggestion, but I think this is a solid improvement on its own; my comment could be implemented in a separate PR if that's preferable or if the proposal is contentious.

A few nits, no blockers.

Comment on lines +610 to +611:

```python
node_a.index,
node_b.index,
```
Contributor


nit: If you're touching this anyway, I would use an f-string instead:

```python
self.log.warning(f"disconnect_nodes: {node_a.index} and {node_b.index} were not connected")
```

Comment on lines +599 to +600:

```python
def disconnect_nodes_helper(node_a, node_b):
    def get_peer_ids(from_connection, node_num):
```
Contributor


nit: Since you're touching this, I think adding type hints here would be helpful. Also, for consistency, I would prefer calling it node instead of connection and index instead of num:

Suggested change:

```diff
-def disconnect_nodes_helper(node_a, node_b):
-    def get_peer_ids(from_connection, node_num):
+def disconnect_nodes(self, a: int, b: int) -> None:
+    def disconnect_nodes_helper(node_a: TestNode, node_b: TestNode) -> None:
+        def get_peer_ids(from_node: TestNode, to_node_index: int) -> List[int]:
```

Member Author


I kept the name as-is to make the diff minimal. If there are additional changes, they can be made in a follow-up.

If there is a follow-up, it would probably be fine to also make to_node_index of type TestNode.
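
One possible shape of that follow-up (hypothetical; nothing like it is part of this PR):

```python
# Hypothetical follow-up only; TestNode is the framework's node wrapper.
from typing import List

def get_peer_ids(from_node: 'TestNode', to_node: 'TestNode') -> List[int]:
    return [peer['id'] for peer in from_node.getpeerinfo()
            if f"testnode{to_node.index}" in peer['subver']]
```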

Member


@stickies-v maybe you can follow up here?

@DrahtBot
Contributor

DrahtBot commented Sep 23, 2022

The following sections might be updated with supplementary metadata relevant to reviewers and maintainers.

Conflicts

No conflicts as of last run.

Member

@glozow glozow left a comment


ACK faeea28

Comment on lines -625 to +626:

```diff
-            self.wait_until(lambda: not get_peer_ids(), timeout=5)
+            self.wait_until(lambda: not get_peer_ids(node_a, node_b.index), timeout=5)
+            self.wait_until(lambda: not get_peer_ids(node_b, node_a.index), timeout=5)
```
Member


OK, I interpret this race to have been:

  1. call disconnect(1, 2), wait until 2 no longer shows up in node1.getpeerinfo()
  2. call connect(0, 2). get to_num_peers = 2 when querying node2.getpeerinfo(), because node1 still shows up there
  3. node1 actually disconnects from node2. now there's only 1 peer in node2.getpeerinfo()
  4. lambda: sum(peer['version'] != 0 for peer in to_connection.getpeerinfo()) == to_num_peers == 2 can never become true
  5. hang

Which is why we need to wait for both.

And now the order is:

  1. call disconnect(1, 2), wait until 2 no longer shows up in node1.getpeerinfo() AND wait until 1 no longer shows up in node2.getpeerinfo()
  2. call connect(0, 2). get to_num_peers = 1 when querying node2.getpeerinfo()
  3. lambda: sum(peer['version'] != 0 for peer in to_connection.getpeerinfo()) == to_num_peers == 1, which becomes true once the new connection completes its handshake
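
The same race, restated schematically (hypothetical values matching the numbered steps above):

```python
# Schematic restatement of the steps above; not framework code.
to_num_peers = 1 + 1      # step 2: node1 still shows up, so 1 existing + 1 new = 2
live_veracked_peers = 1   # step 3: the pending disconnect lands; only 1 peer remains

# step 4: the predicate compares against the stale count and can never hold,
# so wait_until() spins until it times out (step 5):
assert not (live_veracked_peers == to_num_peers)
```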

@glozow glozow merged commit b2da6dd into bitcoin:master Sep 28, 2022
@maflcko
Member Author

maflcko commented Sep 28, 2022

> I think it's not unreasonable to assume that a node may update its peers in the background?

The functional test framework runs on a single main thread, so nothing should be happening in the background. If something were happening in the background, it would either be irrelevant to the test, or it could result in a rare intermittent false failure.

sidhujag pushed a commit to syscoin/syscoin that referenced this pull request Sep 28, 2022
fanquake added a commit that referenced this pull request Sep 29, 2022
faeea28 test: Avoid race in disconnect_nodes helper (MacroFake)

Pull request description:

  Backport of #26138

ACKs for top commit:
  fanquake:
    ACK faeea28

Tree-SHA512: f967c38750220bd6c245db953055f8e6d5402b3a24081ca03795a8403c2ed4eab772b2e9c2d3b581c3bc55d191dd4e22711b5f97d39856d676f10799fc64a9c7
@maflcko maflcko deleted the 2209-test-race-🍄 branch September 29, 2022 09:27
@bitcoin bitcoin locked and limited conversation to collaborators Sep 29, 2023

Successfully merging this pull request may close these issues.

test: failure in wallet_reorgsrestore.py