-
Notifications
You must be signed in to change notification settings - Fork 4k
Closed
Labels
S-1-stabilitySevere stability issues that can be fixed by upgrading, but usually don’t resolve by restartingSevere stability issues that can be fixed by upgrading, but usually don’t resolve by restarting
Milestone
Description
It seems that Gossip.SetStorage and Node.gossipStores grab mutexes in different order.
My understanding is:
- Gossip.SetStorage grabs gossip lock, calls ReadBootstrapInfo which grabs stores rlock.
- Node.gossipStores calls VisitStores, which grabs stores rlock, in the closure: Gossip.AddInfoProto->AddInfo, which grabs gossip lock.
Steps to reporoduce (@5151da6461852f57785db18511efd8ccf918b39c), using go-deadlock:
go get github.com/sasha-s/go-deadlock/...
cd src/github.com/cockroachdb
find . -name "*.go" | xargs -n 1 sed -i '' 's/sync.RWMutex/deadlock.RWMutex/'
find . -name "*.go" | xargs -n 1 sed -i '' 's/sync.Mutex/deadlock.Mutex/'
goimports -w .
cd cockroach
make acceptance TESTS='' TESTFLAGS='-v -d 1200s -l .' TESTTIMEOUT=1210s
look into acceptance/TestDockerC/roach_/stderr._
Stack traces from go-deadlock:
POTENTIAL DEADLOCK: Inconsistent locking. saw this ordering in one goroutine:
happened before
src/github.com/cockroachdb/cockroach/gossip/gossip.go:261 gossip.(*Gossip).SetStorage { g.mu.Lock() } <<<<<
src/github.com/cockroachdb/cockroach/server/node.go:437 server.(*Node).initStores { if err := n.ctx.Gossip.SetStorage(n.stores); err != nil { }
src/github.com/cockroachdb/cockroach/server/node.go:332 server.(*Node).start { if err := n.initStores(engines, n.stopper); err != nil { }
src/github.com/cockroachdb/cockroach/server/server.go:402 server.(*Server).Start { if err := s.node.start(unresolvedAddr, s.ctx.Engines, s.ctx.NodeAttributes); err != nil { }
src/github.com/cockroachdb/cockroach/cli/start.go:341 cli.runStart { if err := s.Start(); err != nil { }
src/github.com/spf13/cobra/command.go:600 cobra.(*Command).execute ???
src/github.com/spf13/cobra/command.go:690 cobra.(*Command).ExecuteC ???
src/github.com/spf13/cobra/command.go:649 cobra.(*Command).Execute ???
src/github.com/cockroachdb/cockroach/cli/cli.go:95 cli.Run { return cockroachCmd.Execute() }
src/github.com/cockroachdb/cockroach/main.go:37 main.main { if err := cli.Run(os.Args[1:]); err != nil { }
/usr/local/go/src/runtime/proc.go:188 runtime.main { main_main() }
happened after
src/github.com/cockroachdb/cockroach/storage/stores.go:279 storage.(*Stores).ReadBootstrapInfo { ls.mu.RLock() } <<<<<
src/github.com/cockroachdb/cockroach/gossip/gossip.go:267 gossip.(*Gossip).SetStorage { err := storage.ReadBootstrapInfo(&storedBI) }
src/github.com/cockroachdb/cockroach/server/node.go:437 server.(*Node).initStores { if err := n.ctx.Gossip.SetStorage(n.stores); err != nil { }
src/github.com/cockroachdb/cockroach/server/node.go:332 server.(*Node).start { if err := n.initStores(engines, n.stopper); err != nil { }
src/github.com/cockroachdb/cockroach/server/server.go:402 server.(*Server).Start { if err := s.node.start(unresolvedAddr, s.ctx.Engines, s.ctx.NodeAttributes); err != nil { }
src/github.com/cockroachdb/cockroach/cli/start.go:341 cli.runStart { if err := s.Start(); err != nil { }
src/github.com/spf13/cobra/command.go:600 cobra.(*Command).execute ???
src/github.com/spf13/cobra/command.go:690 cobra.(*Command).ExecuteC ???
src/github.com/spf13/cobra/command.go:649 cobra.(*Command).Execute ???
src/github.com/cockroachdb/cockroach/cli/cli.go:95 cli.Run { return cockroachCmd.Execute() }
src/github.com/cockroachdb/cockroach/main.go:37 main.main { if err := cli.Run(os.Args[1:]); err != nil { }
/usr/local/go/src/runtime/proc.go:188 runtime.main { main_main() }
in another goroutine: happened before
src/github.com/cockroachdb/cockroach/storage/stores.go:118 storage.(*Stores).VisitStores { ls.mu.RLock() } <<<<<
src/github.com/cockroachdb/cockroach/server/node.go:588 server.(*Node).gossipStores { if err := n.stores.VisitStores(func(s *storage.Store) error { }
src/github.com/cockroachdb/cockroach/server/node.go:568 server.(*Node).startGossip.func1 { n.gossipStores() // one-off run before going to sleep }
src/github.com/cockroachdb/cockroach/util/stop/stopper.go:180 stop.(*Stopper).RunWorker.func1 { f() }
happend after
src/github.com/cockroachdb/cockroach/gossip/gossip.go:545 gossip.(*Gossip).AddInfo { g.mu.Lock() } <<<<<
src/github.com/cockroachdb/cockroach/gossip/gossip.go:561 gossip.(*Gossip).AddInfoProto { return g.AddInfo(key, bytes, ttl) }
src/github.com/cockroachdb/cockroach/storage/store.go:1341 storage.(*Store).GossipStore { if err := s.ctx.Gossip.AddInfoProto(gossipStoreKey, storeDesc, ttlStoreGossip); err != nil { }
src/github.com/cockroachdb/cockroach/server/node.go:589 server.(*Node).gossipStores.func1 { s.GossipStore() }
src/github.com/cockroachdb/cockroach/storage/stores.go:121 storage.(*Stores).VisitStores { if err := visitor(s); err != nil { }
src/github.com/cockroachdb/cockroach/server/node.go:588 server.(*Node).gossipStores { if err := n.stores.VisitStores(func(s *storage.Store) error { }
src/github.com/cockroachdb/cockroach/server/node.go:568 server.(*Node).startGossip.func1 { n.gossipStores() // one-off run before going to sleep }
src/github.com/cockroachdb/cockroach/util/stop/stopper.go:180 stop.(*Stopper).RunWorker.func1 { f() }
ahjdzx
Metadata
Metadata
Assignees
Labels
S-1-stabilitySevere stability issues that can be fixed by upgrading, but usually don’t resolve by restartingSevere stability issues that can be fixed by upgrading, but usually don’t resolve by restarting