From f63ed9f5106227dc56c11112e7e8024092ac79fc Mon Sep 17 00:00:00 2001 From: n0ctal <4c866w5fn9@privaterelay.appleid.com> Date: Sat, 20 Jun 2026 03:38:25 +0500 Subject: [PATCH] fix(jobs): isolate per-node background goroutines from panics (#5397) A panic in a goroutine without a recover takes the whole panel down. The per-node heartbeat and traffic-sync goroutines run remote network I/O for each node with no panic isolation, so one misbehaving node could crash the master. Add common.GoRecover(name, fn), which runs fn in a goroutine guarded by a recover that logs the panic with a stack trace instead of crashing, and use it for the per-node heartbeat, traffic-sync and global-push goroutines. The deferred WaitGroup/semaphore releases still run during panic unwind, so the group never stalls. Other background goroutines can adopt the same helper. --- internal/util/common/err.go | 15 +++++++++ internal/util/common/gorecover_test.go | 41 +++++++++++++++++++++++ internal/web/job/node_heartbeat_job.go | 6 ++-- internal/web/job/node_traffic_sync_job.go | 12 ++++--- 4 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 internal/util/common/gorecover_test.go diff --git a/internal/util/common/err.go b/internal/util/common/err.go index 54d66e8a8..bd4237a37 100644 --- a/internal/util/common/err.go +++ b/internal/util/common/err.go @@ -4,6 +4,7 @@ package common import ( "errors" "fmt" + "runtime/debug" "github.com/mhsanaei/3x-ui/v3/internal/logger" ) @@ -30,3 +31,17 @@ func Recover(msg string) any { } return panicErr } + +// GoRecover runs fn in a new goroutine guarded by a recover, so a panic in a +// background goroutine is logged (with name and a stack trace) instead of taking +// the whole process down. name identifies the goroutine in the log. +func GoRecover(name string, fn func()) { + go func() { + defer func() { + if r := recover(); r != nil { + logger.Error("panic in goroutine", name, ":", r, "\n"+string(debug.Stack())) + } + }() + fn() + }() +} diff --git a/internal/util/common/gorecover_test.go b/internal/util/common/gorecover_test.go new file mode 100644 index 000000000..d9d0f3b04 --- /dev/null +++ b/internal/util/common/gorecover_test.go @@ -0,0 +1,41 @@ +package common + +import ( + "os" + "testing" + "time" + + "github.com/mhsanaei/3x-ui/v3/internal/logger" + "github.com/op/go-logging" +) + +func TestMain(m *testing.M) { + logger.InitLogger(logging.ERROR) + os.Exit(m.Run()) +} + +func TestGoRecover_RunsFn(t *testing.T) { + done := make(chan struct{}) + GoRecover("test-run", func() { close(done) }) + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("fn did not run") + } +} + +func TestGoRecover_RecoversPanic(t *testing.T) { + done := make(chan struct{}) + // If GoRecover did not recover, this panic would crash the test binary. + GoRecover("test-panic", func() { + defer close(done) + panic("boom") + }) + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("goroutine did not complete") + } + // Let the deferred recover+log run before the test ends. + time.Sleep(50 * time.Millisecond) +} diff --git a/internal/web/job/node_heartbeat_job.go b/internal/web/job/node_heartbeat_job.go index 16111bf1e..c8c14b419 100644 --- a/internal/web/job/node_heartbeat_job.go +++ b/internal/web/job/node_heartbeat_job.go @@ -9,6 +9,7 @@ import ( "github.com/mhsanaei/3x-ui/v3/internal/database/model" "github.com/mhsanaei/3x-ui/v3/internal/eventbus" "github.com/mhsanaei/3x-ui/v3/internal/logger" + "github.com/mhsanaei/3x-ui/v3/internal/util/common" "github.com/mhsanaei/3x-ui/v3/internal/web/service" "github.com/mhsanaei/3x-ui/v3/internal/web/websocket" ) @@ -50,11 +51,12 @@ func (j *NodeHeartbeatJob) Run() { } wg.Add(1) sem <- struct{}{} - go func(n *model.Node) { + n := n + common.GoRecover("node-heartbeat:"+n.Name, func() { defer wg.Done() defer func() { <-sem }() j.probeOne(n) - }(n) + }) } wg.Wait() diff --git a/internal/web/job/node_traffic_sync_job.go b/internal/web/job/node_traffic_sync_job.go index d7f3e8f35..eddc0b20b 100644 --- a/internal/web/job/node_traffic_sync_job.go +++ b/internal/web/job/node_traffic_sync_job.go @@ -8,10 +8,10 @@ import ( "github.com/mhsanaei/3x-ui/v3/internal/database/model" "github.com/mhsanaei/3x-ui/v3/internal/logger" + "github.com/mhsanaei/3x-ui/v3/internal/util/common" "github.com/mhsanaei/3x-ui/v3/internal/web/runtime" "github.com/mhsanaei/3x-ui/v3/internal/web/service" "github.com/mhsanaei/3x-ui/v3/internal/web/websocket" - "github.com/mhsanaei/3x-ui/v3/internal/xray" ) const ( @@ -96,11 +96,12 @@ func (j *NodeTrafficSyncJob) Run() { } wg.Add(1) sem <- struct{}{} - go func(n *model.Node) { + n := n + common.GoRecover("node-traffic-sync:"+n.Name, func() { defer wg.Done() defer func() { <-sem }() j.syncOne(mgr, n, doIpSync) - }(n) + }) } wg.Wait() @@ -211,7 +212,8 @@ func (j *NodeTrafficSyncJob) maybePushGlobals(mgr *runtime.Manager, nodes []*mod } wg.Add(1) sem <- struct{}{} - go func(n *model.Node, remote *runtime.Remote, traffics []*xray.ClientTraffic) { + n, remote, traffics := n, remote, traffics + common.GoRecover("node-global-push:"+n.Name, func() { defer wg.Done() defer func() { <-sem }() ctx, cancel := context.WithTimeout(context.Background(), nodeTrafficSyncRequestTimeout) @@ -225,7 +227,7 @@ func (j *NodeTrafficSyncJob) maybePushGlobals(mgr *runtime.Manager, nodes []*mod logger.Warning("node traffic sync: push globals to", n.Name, "failed:", err) } } - }(n, remote, traffics) + }) } wg.Wait() }