From cec05c7440b5680c4f4a2683936206f584c9fd97 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Mon, 13 Oct 2025 13:12:34 -0500 Subject: [PATCH 01/17] change logger --- pexec/managed_process.go | 4 +- pexec/panic_test.go | 187 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 pexec/panic_test.go diff --git a/pexec/managed_process.go b/pexec/managed_process.go index 6b56a4c9..ecb02e74 100644 --- a/pexec/managed_process.go +++ b/pexec/managed_process.go @@ -439,9 +439,9 @@ func (p *managedProcess) startLoggers(stdOut, stdErr io.ReadCloser) func() { } if p.shouldLog { if isErr { - logger.Error("\n\\_ " + string(line)) + logger.Error("\n\u001F\\_ " + string(line)) } else { - logger.Info("\n\\_ " + string(line)) + logger.Info("\n\u001F\\_ " + string(line)) } } if p.logWriter != nil && !logWriterError { diff --git a/pexec/panic_test.go b/pexec/panic_test.go new file mode 100644 index 00000000..4baa9786 --- /dev/null +++ b/pexec/panic_test.go @@ -0,0 +1,187 @@ +package pexec + +import ( + "context" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +// mockLogger captures all log calls with their levels +type mockLogger struct { + name string + logs []logEntry +} + +type logEntry struct { + level string + message string + logger string +} + +func (m *mockLogger) Debug(args ...interface{}) { m.log("DEBUG", args...) } +func (m *mockLogger) Debugf(template string, args ...interface{}) { m.logf("DEBUG", template, args...) } +func (m *mockLogger) Debugw(msg string, keysAndValues ...interface{}) { m.log("DEBUG", msg) } +func (m *mockLogger) Info(args ...interface{}) { m.log("INFO", args...) } +func (m *mockLogger) Infof(template string, args ...interface{}) { m.logf("INFO", template, args...) } +func (m *mockLogger) Infow(msg string, keysAndValues ...interface{}) { m.log("INFO", msg) } +func (m *mockLogger) Warn(args ...interface{}) { m.log("WARN", args...) } +func (m *mockLogger) Warnf(template string, args ...interface{}) { m.logf("WARN", template, args...) } +func (m *mockLogger) Warnw(msg string, keysAndValues ...interface{}) { m.log("WARN", msg) } +func (m *mockLogger) Error(args ...interface{}) { m.log("ERROR", args...) } +func (m *mockLogger) Errorf(template string, args ...interface{}) { m.logf("ERROR", template, args...) } +func (m *mockLogger) Errorw(msg string, keysAndValues ...interface{}) { m.log("ERROR", msg) } +func (m *mockLogger) Fatal(args ...interface{}) { m.log("FATAL", args...) } +func (m *mockLogger) Fatalf(template string, args ...interface{}) { m.logf("FATAL", template, args...) } +func (m *mockLogger) Fatalw(msg string, keysAndValues ...interface{}) { m.log("FATAL", msg) } + +func (m *mockLogger) log(level string, args ...interface{}) { + message := fmt.Sprint(args...) + m.logs = append(m.logs, logEntry{level: level, message: message, logger: m.name}) + fmt.Printf("[MOCK %s] %s: %s\n", m.name, level, message) +} + +func (m *mockLogger) logf(level string, template string, args ...interface{}) { + message := fmt.Sprintf(template, args...) + m.logs = append(m.logs, logEntry{level: level, message: message, logger: m.name}) + fmt.Printf("[MOCK %s] %s: %s\n", m.name, level, message) +} + +// Implement other required methods to satisfy ZapCompatibleLogger interface +func (m *mockLogger) Desugar() *zap.Logger { return nil } +func (m *mockLogger) Level() zapcore.Level { return zapcore.InfoLevel } +func (m *mockLogger) Named(name string) *zap.SugaredLogger { + // Return a dummy sugared logger, but our mock will still capture calls + return zap.NewNop().Sugar() +} +func (m *mockLogger) Sync() error { return nil } +func (m *mockLogger) WithOptions(opts ...zap.Option) *zap.SugaredLogger { + return zap.NewNop().Sugar() +} + +func TestPexecPanicLogging(t *testing.T) { + // Create a temporary Go program that panics + tempDir := t.TempDir() + panicProgram := filepath.Join(tempDir, "main.go") + + panicCode := `package main + +func main() { + panic("test panic for pexec analysis") +} +` + + err := os.WriteFile(panicProgram, []byte(panicCode), 0644) + if err != nil { + t.Fatalf("Failed to write panic program: %v", err) + } + + // Create go.mod + goMod := filepath.Join(tempDir, "go.mod") + err = os.WriteFile(goMod, []byte("module panictest\ngo 1.21\n"), 0644) + if err != nil { + t.Fatalf("Failed to write go.mod: %v", err) + } + + // Create mock loggers to capture exactly what gets logged + mainLogger := &mockLogger{name: "main"} + stdoutLogger := &mockLogger{name: "stdout"} + stderrLogger := &mockLogger{name: "stderr"} + + // Configure ProcessConfig + config := ProcessConfig{ + ID: "panic-test", + Name: "go", + Args: []string{"run", "main.go"}, + CWD: tempDir, + Log: true, + StdOutLogger: stdoutLogger, + StdErrLogger: stderrLogger, + } + //start processes + process := NewManagedProcess(config, mainLogger) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + err = process.Start(ctx) + if err != nil { + fmt.Printf("Process start error (expected): %v\n", err) + } + + // Wait a moment for the process to run and panic + time.Sleep(5 * time.Second) + process.Stop() + + // Output the captured logs + fmt.Println("=== MAIN LOGGER ===") + for _, log := range mainLogger.logs { + fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + } + fmt.Println("=== STDOUT LOGGER ===") + for _, log := range stdoutLogger.logs { + fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + } + fmt.Println("=== STDERR LOGGER ===") + for _, log := range stderrLogger.logs { + fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + } + + // fmt.Println("\n=== ANALYSIS ===") + // fmt.Printf("Main logger captured %d logs:\n", len(mainLogger.logs)) + // for _, log := range mainLogger.logs { + // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + // } + + // fmt.Printf("\nStdOut logger captured %d logs:\n", len(stdoutLogger.logs)) + // for _, log := range stdoutLogger.logs { + // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + // if strings.Contains(log.message, "panic") { + // fmt.Printf(" ^^^ PANIC FOUND IN STDOUT! ^^^\n") + // } + // } + + // fmt.Printf("\nStdErr logger captured %d logs:\n", len(stderrLogger.logs)) + // for _, log := range stderrLogger.logs { + // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) + // if strings.Contains(log.message, "panic") { + // fmt.Printf(" ^^^ PANIC FOUND IN STDERR! ^^^\n") + // } + // } + + // // Check if panic appears in both streams + // panicInStdout := false + // panicInStderr := false + + // for _, log := range stdoutLogger.logs { + // if strings.Contains(log.message, "panic") { + // panicInStdout = true + // fmt.Printf("\n🚨 STDOUT Logger.%s() called with panic!\n", log.level) + // } + // } + + // for _, log := range stderrLogger.logs { + // if strings.Contains(log.message, "panic") { + // panicInStderr = true + // fmt.Printf("\n🚨 STDERR Logger.%s() called with panic!\n", log.level) + // } + // } + + // fmt.Printf("\n=== SUMMARY ===\n") + // fmt.Printf("Panic found in stdout logger: %v\n", panicInStdout) + // fmt.Printf("Panic found in stderr logger: %v\n", panicInStderr) + + // if panicInStdout && panicInStderr { + // fmt.Printf("šŸ”„ DOUBLE LOGGING CONFIRMED: Panic went through BOTH stdout and stderr!\n") + // } else if panicInStdout { + // fmt.Printf("šŸ¤” UNEXPECTED: Panic only went through stdout\n") + // } else if panicInStderr { + // fmt.Printf("āœ… EXPECTED: Panic only went through stderr\n") + // } else { + // fmt.Printf("ā“ NO PANIC CAPTURED: Something went wrong\n") + // } +} \ No newline at end of file From 13c87f25cc6db7fd93afb4e2221fae48a2d040c4 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Thu, 16 Oct 2025 17:28:40 -0400 Subject: [PATCH 02/17] Delete panic_test.go --- pexec/panic_test.go | 187 -------------------------------------------- 1 file changed, 187 deletions(-) delete mode 100644 pexec/panic_test.go diff --git a/pexec/panic_test.go b/pexec/panic_test.go deleted file mode 100644 index 4baa9786..00000000 --- a/pexec/panic_test.go +++ /dev/null @@ -1,187 +0,0 @@ -package pexec - -import ( - "context" - "fmt" - "os" - "path/filepath" - "testing" - "time" - - "go.uber.org/zap" - "go.uber.org/zap/zapcore" -) - -// mockLogger captures all log calls with their levels -type mockLogger struct { - name string - logs []logEntry -} - -type logEntry struct { - level string - message string - logger string -} - -func (m *mockLogger) Debug(args ...interface{}) { m.log("DEBUG", args...) } -func (m *mockLogger) Debugf(template string, args ...interface{}) { m.logf("DEBUG", template, args...) } -func (m *mockLogger) Debugw(msg string, keysAndValues ...interface{}) { m.log("DEBUG", msg) } -func (m *mockLogger) Info(args ...interface{}) { m.log("INFO", args...) } -func (m *mockLogger) Infof(template string, args ...interface{}) { m.logf("INFO", template, args...) } -func (m *mockLogger) Infow(msg string, keysAndValues ...interface{}) { m.log("INFO", msg) } -func (m *mockLogger) Warn(args ...interface{}) { m.log("WARN", args...) } -func (m *mockLogger) Warnf(template string, args ...interface{}) { m.logf("WARN", template, args...) } -func (m *mockLogger) Warnw(msg string, keysAndValues ...interface{}) { m.log("WARN", msg) } -func (m *mockLogger) Error(args ...interface{}) { m.log("ERROR", args...) } -func (m *mockLogger) Errorf(template string, args ...interface{}) { m.logf("ERROR", template, args...) } -func (m *mockLogger) Errorw(msg string, keysAndValues ...interface{}) { m.log("ERROR", msg) } -func (m *mockLogger) Fatal(args ...interface{}) { m.log("FATAL", args...) } -func (m *mockLogger) Fatalf(template string, args ...interface{}) { m.logf("FATAL", template, args...) } -func (m *mockLogger) Fatalw(msg string, keysAndValues ...interface{}) { m.log("FATAL", msg) } - -func (m *mockLogger) log(level string, args ...interface{}) { - message := fmt.Sprint(args...) - m.logs = append(m.logs, logEntry{level: level, message: message, logger: m.name}) - fmt.Printf("[MOCK %s] %s: %s\n", m.name, level, message) -} - -func (m *mockLogger) logf(level string, template string, args ...interface{}) { - message := fmt.Sprintf(template, args...) - m.logs = append(m.logs, logEntry{level: level, message: message, logger: m.name}) - fmt.Printf("[MOCK %s] %s: %s\n", m.name, level, message) -} - -// Implement other required methods to satisfy ZapCompatibleLogger interface -func (m *mockLogger) Desugar() *zap.Logger { return nil } -func (m *mockLogger) Level() zapcore.Level { return zapcore.InfoLevel } -func (m *mockLogger) Named(name string) *zap.SugaredLogger { - // Return a dummy sugared logger, but our mock will still capture calls - return zap.NewNop().Sugar() -} -func (m *mockLogger) Sync() error { return nil } -func (m *mockLogger) WithOptions(opts ...zap.Option) *zap.SugaredLogger { - return zap.NewNop().Sugar() -} - -func TestPexecPanicLogging(t *testing.T) { - // Create a temporary Go program that panics - tempDir := t.TempDir() - panicProgram := filepath.Join(tempDir, "main.go") - - panicCode := `package main - -func main() { - panic("test panic for pexec analysis") -} -` - - err := os.WriteFile(panicProgram, []byte(panicCode), 0644) - if err != nil { - t.Fatalf("Failed to write panic program: %v", err) - } - - // Create go.mod - goMod := filepath.Join(tempDir, "go.mod") - err = os.WriteFile(goMod, []byte("module panictest\ngo 1.21\n"), 0644) - if err != nil { - t.Fatalf("Failed to write go.mod: %v", err) - } - - // Create mock loggers to capture exactly what gets logged - mainLogger := &mockLogger{name: "main"} - stdoutLogger := &mockLogger{name: "stdout"} - stderrLogger := &mockLogger{name: "stderr"} - - // Configure ProcessConfig - config := ProcessConfig{ - ID: "panic-test", - Name: "go", - Args: []string{"run", "main.go"}, - CWD: tempDir, - Log: true, - StdOutLogger: stdoutLogger, - StdErrLogger: stderrLogger, - } - //start processes - process := NewManagedProcess(config, mainLogger) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - err = process.Start(ctx) - if err != nil { - fmt.Printf("Process start error (expected): %v\n", err) - } - - // Wait a moment for the process to run and panic - time.Sleep(5 * time.Second) - process.Stop() - - // Output the captured logs - fmt.Println("=== MAIN LOGGER ===") - for _, log := range mainLogger.logs { - fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - } - fmt.Println("=== STDOUT LOGGER ===") - for _, log := range stdoutLogger.logs { - fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - } - fmt.Println("=== STDERR LOGGER ===") - for _, log := range stderrLogger.logs { - fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - } - - // fmt.Println("\n=== ANALYSIS ===") - // fmt.Printf("Main logger captured %d logs:\n", len(mainLogger.logs)) - // for _, log := range mainLogger.logs { - // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - // } - - // fmt.Printf("\nStdOut logger captured %d logs:\n", len(stdoutLogger.logs)) - // for _, log := range stdoutLogger.logs { - // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - // if strings.Contains(log.message, "panic") { - // fmt.Printf(" ^^^ PANIC FOUND IN STDOUT! ^^^\n") - // } - // } - - // fmt.Printf("\nStdErr logger captured %d logs:\n", len(stderrLogger.logs)) - // for _, log := range stderrLogger.logs { - // fmt.Printf(" [%s] %s: %s\n", log.logger, log.level, log.message) - // if strings.Contains(log.message, "panic") { - // fmt.Printf(" ^^^ PANIC FOUND IN STDERR! ^^^\n") - // } - // } - - // // Check if panic appears in both streams - // panicInStdout := false - // panicInStderr := false - - // for _, log := range stdoutLogger.logs { - // if strings.Contains(log.message, "panic") { - // panicInStdout = true - // fmt.Printf("\n🚨 STDOUT Logger.%s() called with panic!\n", log.level) - // } - // } - - // for _, log := range stderrLogger.logs { - // if strings.Contains(log.message, "panic") { - // panicInStderr = true - // fmt.Printf("\n🚨 STDERR Logger.%s() called with panic!\n", log.level) - // } - // } - - // fmt.Printf("\n=== SUMMARY ===\n") - // fmt.Printf("Panic found in stdout logger: %v\n", panicInStdout) - // fmt.Printf("Panic found in stderr logger: %v\n", panicInStderr) - - // if panicInStdout && panicInStderr { - // fmt.Printf("šŸ”„ DOUBLE LOGGING CONFIRMED: Panic went through BOTH stdout and stderr!\n") - // } else if panicInStdout { - // fmt.Printf("šŸ¤” UNEXPECTED: Panic only went through stdout\n") - // } else if panicInStderr { - // fmt.Printf("āœ… EXPECTED: Panic only went through stderr\n") - // } else { - // fmt.Printf("ā“ NO PANIC CAPTURED: Something went wrong\n") - // } -} \ No newline at end of file From a189af29fe704fb985e52fa6950072b3bda0b169 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Tue, 11 Nov 2025 14:52:14 -0500 Subject: [PATCH 03/17] warning remove --- rpc/wrtc_client.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index 4f5d026b..d47737ab 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -3,6 +3,7 @@ package rpc import ( "context" "io" + "strings" "sync" "testing" "time" @@ -388,7 +389,9 @@ func dialWebRTC( utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - logger.Warnw("Failed to exchange candidates", "err", err) + if !strings.Contains(err.Error(), "host appears to be offline") { + logger.Warnw("Failed to exchange candidates", "err", err) + } exchangeCancel(err) } }) @@ -411,7 +414,9 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - logger.Warnw("Problem sending error to signaling server", "err", err) + if !strings.Contains(err.Error(), "no active offer for \"\"") { + logger.Warnw("Problem sending error to signaling server", "err", err) + } } }) return nil, exchangeErr From 4cd064cf99770a741967a7ccac29e48e0bea192e Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Tue, 11 Nov 2025 15:06:07 -0500 Subject: [PATCH 04/17] Revert "change logger" This reverts commit cec05c7440b5680c4f4a2683936206f584c9fd97. --- pexec/managed_process.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pexec/managed_process.go b/pexec/managed_process.go index ecb02e74..6b56a4c9 100644 --- a/pexec/managed_process.go +++ b/pexec/managed_process.go @@ -439,9 +439,9 @@ func (p *managedProcess) startLoggers(stdOut, stdErr io.ReadCloser) func() { } if p.shouldLog { if isErr { - logger.Error("\n\u001F\\_ " + string(line)) + logger.Error("\n\\_ " + string(line)) } else { - logger.Info("\n\u001F\\_ " + string(line)) + logger.Info("\n\\_ " + string(line)) } } if p.logWriter != nil && !logWriterError { From e1acc46481e427f2593dc50a0eed59afac4bf234 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 14:14:42 -0500 Subject: [PATCH 05/17] j --- rpc/wrtc_call_queue_mongodb.go | 4 ++-- rpc/wrtc_client.go | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 14e7a7cd..eb79ad1e 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1049,7 +1049,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var errOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") +var ErrOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation @@ -1081,7 +1081,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts return err } if len(ret) == 0 { - return errOffline + return ErrOffline } return nil } diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index d47737ab..38dbe707 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -387,11 +387,13 @@ func dialWebRTC( } } + isErrOffline := false utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - if !strings.Contains(err.Error(), "host appears to be offline") { + if !errors.Is(err, ErrOffline) { logger.Warnw("Failed to exchange candidates", "err", err) } + isErrOffline = true exchangeCancel(err) } }) @@ -414,7 +416,8 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - if !strings.Contains(err.Error(), "no active offer for \"\"") { + var errInactive inactiveOfferError + if !isErrOffline && !errors.As(err, &errInactive) { logger.Warnw("Problem sending error to signaling server", "err", err) } } From a91b8a21203ed50b80fa574856ede832e6842fb8 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 15:04:38 -0500 Subject: [PATCH 06/17] Revert "j" This reverts commit e1acc46481e427f2593dc50a0eed59afac4bf234. --- rpc/wrtc_call_queue_mongodb.go | 4 ++-- rpc/wrtc_client.go | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index d6afee50..5443539e 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,7 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var ErrOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") +var errOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation @@ -1128,7 +1128,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts return err } if len(ret) == 0 { - return ErrOffline + return errOffline } return nil } diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index 38dbe707..d47737ab 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -387,13 +387,11 @@ func dialWebRTC( } } - isErrOffline := false utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - if !errors.Is(err, ErrOffline) { + if !strings.Contains(err.Error(), "host appears to be offline") { logger.Warnw("Failed to exchange candidates", "err", err) } - isErrOffline = true exchangeCancel(err) } }) @@ -416,8 +414,7 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - var errInactive inactiveOfferError - if !isErrOffline && !errors.As(err, &errInactive) { + if !strings.Contains(err.Error(), "no active offer for \"\"") { logger.Warnw("Problem sending error to signaling server", "err", err) } } From ff27d9d3d3b30231415eeae3175820d443f778a0 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 15:05:23 -0500 Subject: [PATCH 07/17] Revert "warning remove" This reverts commit a189af29fe704fb985e52fa6950072b3bda0b169. --- rpc/wrtc_client.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index d47737ab..4f5d026b 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -3,7 +3,6 @@ package rpc import ( "context" "io" - "strings" "sync" "testing" "time" @@ -389,9 +388,7 @@ func dialWebRTC( utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - if !strings.Contains(err.Error(), "host appears to be offline") { - logger.Warnw("Failed to exchange candidates", "err", err) - } + logger.Warnw("Failed to exchange candidates", "err", err) exchangeCancel(err) } }) @@ -414,9 +411,7 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - if !strings.Contains(err.Error(), "no active offer for \"\"") { - logger.Warnw("Problem sending error to signaling server", "err", err) - } + logger.Warnw("Problem sending error to signaling server", "err", err) } }) return nil, exchangeErr From 3e9cabe37b524dc17037f8d19c1fe04bcf15a009 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 15:12:20 -0500 Subject: [PATCH 08/17] lower to debug --- rpc/wrtc_client.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index 4f5d026b..b912407a 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -388,7 +388,7 @@ func dialWebRTC( utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - logger.Warnw("Failed to exchange candidates", "err", err) + logger.Debugw("Failed to exchange candidates", "err", err) exchangeCancel(err) } }) @@ -411,7 +411,7 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - logger.Warnw("Problem sending error to signaling server", "err", err) + logger.Debugw("Problem sending error to signaling server", "err", err) } }) return nil, exchangeErr From 054142ea1c3bb32e3f843e95f7a92e01b3ce22a7 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 17:57:52 -0500 Subject: [PATCH 09/17] Update wrtc_signaling_answerer.go --- rpc/wrtc_signaling_answerer.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rpc/wrtc_signaling_answerer.go b/rpc/wrtc_signaling_answerer.go index f845658f..184bb390 100644 --- a/rpc/wrtc_signaling_answerer.go +++ b/rpc/wrtc_signaling_answerer.go @@ -451,7 +451,11 @@ func (aa *answerAttempt) connect(ctx context.Context) (err error) { if err != nil { logFields = append(logFields, "error", err.Error()) } - aa.logger.Infow("Diagnostic data for failed webrtc connection attempt", logFields...) + candidateStatus := "no remote candidates received" + if len(connInfo.RemoteCandidates) > 0 { + candidateStatus = "candidates received but unreachable" + } + aa.logger.Infow(fmt.Sprintf("Diagnostic data for failed webrtc connection attempt, %s", candidateStatus), logFields...) // Close unhealthy connection. utils.UncheckedError(pc.GracefulClose()) From def83eacb6f70ddd0fb9d042262f00629aebb5ee Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Wed, 12 Nov 2025 18:02:15 -0500 Subject: [PATCH 10/17] Revert "Update wrtc_signaling_answerer.go" This reverts commit 054142ea1c3bb32e3f843e95f7a92e01b3ce22a7. --- rpc/wrtc_signaling_answerer.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rpc/wrtc_signaling_answerer.go b/rpc/wrtc_signaling_answerer.go index 184bb390..f845658f 100644 --- a/rpc/wrtc_signaling_answerer.go +++ b/rpc/wrtc_signaling_answerer.go @@ -451,11 +451,7 @@ func (aa *answerAttempt) connect(ctx context.Context) (err error) { if err != nil { logFields = append(logFields, "error", err.Error()) } - candidateStatus := "no remote candidates received" - if len(connInfo.RemoteCandidates) > 0 { - candidateStatus = "candidates received but unreachable" - } - aa.logger.Infow(fmt.Sprintf("Diagnostic data for failed webrtc connection attempt, %s", candidateStatus), logFields...) + aa.logger.Infow("Diagnostic data for failed webrtc connection attempt", logFields...) // Close unhealthy connection. utils.UncheckedError(pc.GracefulClose()) From b916a0ce4724df4a07a9815b06883f8c308c7cd6 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Fri, 14 Nov 2025 12:11:30 -0500 Subject: [PATCH 11/17] Update wrtc_call_queue_mongodb.go --- rpc/wrtc_call_queue_mongodb.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 5443539e..9cd6eb81 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,7 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var errOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") +var ErrOffline = errors.New("host appears to be offline; ensure machine is online and try again hi") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation @@ -1104,6 +1104,7 @@ var errOffline = status.Error(codes.Unavailable, "host appears to be offline; en // NOTE(benjirewis): The same NOTE about `len(hosts) == 1` applies here as in the method // above. func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts ...string) error { + return ErrOffline ctx, span := trace.StartSpan(ctx, "CallQueue::checkHostOnline") defer span.End() @@ -1128,7 +1129,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts return err } if len(ret) == 0 { - return errOffline + return ErrOffline } return nil } @@ -1149,7 +1150,7 @@ func (queue *mongoDBWebRTCCallQueue) incrementConnectionEstablishmentExpectedFai // error (internal error from MDB query, e.g.). We can tell from the passed in error. reason := "other" - if errors.Is(err, errOffline) { + if errors.Is(err, ErrOffline) { reason = "answerers_offline" } else if errors.Is(err, errTooManyConns) { reason = "too_many_callers" From 8f57c4a4d933ceeb4ba48b26859f7ccc84158548 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Fri, 14 Nov 2025 12:59:10 -0500 Subject: [PATCH 12/17] Update wrtc_call_queue_mongodb.go --- rpc/wrtc_call_queue_mongodb.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 9cd6eb81..cd7d2a54 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,7 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var ErrOffline = errors.New("host appears to be offline; ensure machine is online and try again hi") +var ErrOffline = errors.New("host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation From 359b70b6222e0c217d127a29e4574126f86681cd Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Fri, 14 Nov 2025 13:05:06 -0500 Subject: [PATCH 13/17] Update wrtc_call_queue_mongodb.go --- rpc/wrtc_call_queue_mongodb.go | 1 - 1 file changed, 1 deletion(-) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index cd7d2a54..0174f8fe 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1104,7 +1104,6 @@ var ErrOffline = errors.New("host appears to be offline; ensure machine is onlin // NOTE(benjirewis): The same NOTE about `len(hosts) == 1` applies here as in the method // above. func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts ...string) error { - return ErrOffline ctx, span := trace.StartSpan(ctx, "CallQueue::checkHostOnline") defer span.End() From ec3296228b592fdccac653b3e67637e22b2383b4 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Fri, 14 Nov 2025 15:09:57 -0500 Subject: [PATCH 14/17] Update wrtc_call_queue_mongodb.go --- rpc/wrtc_call_queue_mongodb.go | 1 + 1 file changed, 1 insertion(+) diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 0174f8fe..feff515f 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,6 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } +// ErrOffline is returned when a host appears to be offline. var ErrOffline = errors.New("host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that From 8ed95690bf1dbfce94d13a5a95b5d7ba680d59ea Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Tue, 18 Nov 2025 10:58:31 -0500 Subject: [PATCH 15/17] changes --- rpc/dial.go | 18 ++++++++++++++++-- rpc/wrtc_call_queue_mongodb.go | 6 +++--- rpc/wrtc_client.go | 4 ++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/rpc/dial.go b/rpc/dial.go index 6e489507..32c2470b 100644 --- a/rpc/dial.go +++ b/rpc/dial.go @@ -141,7 +141,7 @@ func dial( mdnsLogger := utils.Sublogger(logger, "mdns") defer wg.Done() - mdnsLogger.Debugw("trying mDNS", "address", address) + mdnsLogger.Debugw("dialMulticastDNS CALLED") conn, cached, err := dialMulticastDNS(ctxParallel, address, mdnsLogger, dOpts) if err != nil { dialCh <- dialResult{err: err} @@ -191,6 +191,7 @@ func dial( "host", originalAddress, ) + webrtcLogger.Debugw("dialFunc WebRTC CALLED") conn, cached, err := dialFunc( ctxParallel, "webrtc", @@ -273,6 +274,10 @@ func dial( return conn, cached, nil } if fatalErr != nil { + logger.Errorw("HERE ",nonFatalErr, fatalErr, errors.Is(fatalErr, ErrOffline)) + if errors.Is(fatalErr, ErrOffline) { + return nil, false, ErrOffline + } return nil, false, fatalErr } @@ -284,8 +289,14 @@ func dial( } var directErr error + logger.Errorw("dialDirectGRPC CALLED") conn, cached, directErr = dialDirectGRPC(ctx, address, dOpts, logger) if directErr != nil { + logger.Errorw("HERE 1", directErr, errors.Is(directErr, ErrOffline)) + logger.Errorw("HERE 2", nonFatalErr, fatalErr) + if errors.Is(directErr, ErrOffline) { + return nil, false, ErrOffline + } return nil, false, multierr.Combine(directErr, nonFatalErr) } if dOpts.debug { @@ -322,6 +333,9 @@ func listMulticastInterfaces() []net.Interface { return interfaces } +// ErrMDNSQueryFailed is returned when a mDNS query fails to find a candidate. +var ErrMDNSQuery = errors.New("mDNS query failed to find a candidate") + func lookupMDNSCandidate(ctx context.Context, address string, logger utils.ZapCompatibleLogger) (*zeroconf.ServiceEntry, error) { candidates := []string{address, strings.ReplaceAll(address, ".", "-")} // RSDK-8205: logger.Desugar().Sugar() is necessary to massage a ZapCompatibleLogger into a @@ -356,7 +370,7 @@ func lookupMDNSCandidate(ctx context.Context, address string, logger utils.ZapCo if ctx.Err() != nil { return nil, ctx.Err() } - return nil, errors.New("mDNS query failed to find a candidate") + return nil, ErrMDNSQuery } func dialMulticastDNS( diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 5443539e..581d7934 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,7 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var errOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") +var ErrOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation @@ -1128,7 +1128,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts return err } if len(ret) == 0 { - return errOffline + return ErrOffline } return nil } @@ -1149,7 +1149,7 @@ func (queue *mongoDBWebRTCCallQueue) incrementConnectionEstablishmentExpectedFai // error (internal error from MDB query, e.g.). We can tell from the passed in error. reason := "other" - if errors.Is(err, errOffline) { + if errors.Is(err, ErrOffline) { reason = "answerers_offline" } else if errors.Is(err, errTooManyConns) { reason = "too_many_callers" diff --git a/rpc/wrtc_client.go b/rpc/wrtc_client.go index 4f5d026b..b912407a 100644 --- a/rpc/wrtc_client.go +++ b/rpc/wrtc_client.go @@ -388,7 +388,7 @@ func dialWebRTC( utils.PanicCapturingGo(func() { if err := exchangeCandidates(); err != nil { - logger.Warnw("Failed to exchange candidates", "err", err) + logger.Debugw("Failed to exchange candidates", "err", err) exchangeCancel(err) } }) @@ -411,7 +411,7 @@ func dialWebRTC( Error: ErrorToStatus(exchangeErr).Proto(), }, }); err != nil { - logger.Warnw("Problem sending error to signaling server", "err", err) + logger.Debugw("Problem sending error to signaling server", "err", err) } }) return nil, exchangeErr From 6db000f83f3035e3e2135dfc586c20a6170060ce Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Tue, 18 Nov 2025 14:54:04 -0500 Subject: [PATCH 16/17] ipdate --- rpc/dial.go | 6 ------ rpc/wrtc_call_queue_mongodb.go | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/rpc/dial.go b/rpc/dial.go index 32c2470b..4b5b0bef 100644 --- a/rpc/dial.go +++ b/rpc/dial.go @@ -141,7 +141,6 @@ func dial( mdnsLogger := utils.Sublogger(logger, "mdns") defer wg.Done() - mdnsLogger.Debugw("dialMulticastDNS CALLED") conn, cached, err := dialMulticastDNS(ctxParallel, address, mdnsLogger, dOpts) if err != nil { dialCh <- dialResult{err: err} @@ -191,7 +190,6 @@ func dial( "host", originalAddress, ) - webrtcLogger.Debugw("dialFunc WebRTC CALLED") conn, cached, err := dialFunc( ctxParallel, "webrtc", @@ -274,7 +272,6 @@ func dial( return conn, cached, nil } if fatalErr != nil { - logger.Errorw("HERE ",nonFatalErr, fatalErr, errors.Is(fatalErr, ErrOffline)) if errors.Is(fatalErr, ErrOffline) { return nil, false, ErrOffline } @@ -289,11 +286,8 @@ func dial( } var directErr error - logger.Errorw("dialDirectGRPC CALLED") conn, cached, directErr = dialDirectGRPC(ctx, address, dOpts, logger) if directErr != nil { - logger.Errorw("HERE 1", directErr, errors.Is(directErr, ErrOffline)) - logger.Errorw("HERE 2", nonFatalErr, fatalErr) if errors.Is(directErr, ErrOffline) { return nil, false, ErrOffline } diff --git a/rpc/wrtc_call_queue_mongodb.go b/rpc/wrtc_call_queue_mongodb.go index 581d7934..413b8848 100644 --- a/rpc/wrtc_call_queue_mongodb.go +++ b/rpc/wrtc_call_queue_mongodb.go @@ -1093,7 +1093,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostQueueSize(ctx context.Context, for return errTooManyConns } -var ErrOffline = status.Error(codes.Unavailable, "host appears to be offline; ensure machine is online and try again") +var ErrOffline = errors.New("host appears to be offline; ensure machine is online and try again") // checkHostOnline will check if there is some operator for all the managed hosts that // claims to have an answerer online for that host. It does this by running an aggregation @@ -1128,7 +1128,7 @@ func (queue *mongoDBWebRTCCallQueue) checkHostOnline(ctx context.Context, hosts return err } if len(ret) == 0 { - return ErrOffline + return status.Error(codes.Unavailable, ErrOffline.Error()) } return nil } From 53de73373bf2fac9bf975977a44cf95376b621f1 Mon Sep 17 00:00:00 2001 From: Allison Chiang Date: Tue, 18 Nov 2025 15:01:20 -0500 Subject: [PATCH 17/17] oop --- rpc/dial.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/rpc/dial.go b/rpc/dial.go index 4b5b0bef..8d96844e 100644 --- a/rpc/dial.go +++ b/rpc/dial.go @@ -141,6 +141,7 @@ func dial( mdnsLogger := utils.Sublogger(logger, "mdns") defer wg.Done() + mdnsLogger.Debugw("trying mDNS", "address", address) conn, cached, err := dialMulticastDNS(ctxParallel, address, mdnsLogger, dOpts) if err != nil { dialCh <- dialResult{err: err} @@ -272,9 +273,6 @@ func dial( return conn, cached, nil } if fatalErr != nil { - if errors.Is(fatalErr, ErrOffline) { - return nil, false, ErrOffline - } return nil, false, fatalErr } @@ -288,9 +286,6 @@ func dial( var directErr error conn, cached, directErr = dialDirectGRPC(ctx, address, dOpts, logger) if directErr != nil { - if errors.Is(directErr, ErrOffline) { - return nil, false, ErrOffline - } return nil, false, multierr.Combine(directErr, nonFatalErr) } if dOpts.debug {