Skip to content

Commit 6b70e19

Browse files
authored
collect more info for possible powershell crash (#43710)
### What does this PR do? [Windows-specific] This PR collects more information about a possible PowerShell crash in the E2E tests. The crash happens randomly, so the goal is to gather more data in order to understand the problem better. ### Motivation [WINA-1746](https://datadoghq.atlassian.net/browse/WINA-1746) ### Describe how you validated your changes Observe the E2E test results and check the data collected for the PowerShell crash. ### Additional Notes [WINA-1746]: https://datadoghq.atlassian.net/browse/WINA-1746?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ Co-authored-by: hongshi.guo <hongshi.guo@datadoghq.com>
1 parent a969a60 commit 6b70e19

File tree

2 files changed

+116
-16
lines changed

2 files changed

+116
-16
lines changed

test/new-e2e/tests/installer/windows/base_suite.go

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,17 @@ import (
2626
"github.com/stretchr/testify/suite"
2727
)
2828

29+
// isWERDumpCollectionEnabled reports whether WER crash dump collection should run.
//
// Collection is on by default. It is turned off when the DD_E2E_SKIP_WER_DUMPS
// environment variable holds one of "1", "true", "yes" or "on"; the comparison
// ignores letter case and surrounding whitespace. Any other value, including
// an unset or empty variable, leaves collection enabled.
func isWERDumpCollectionEnabled() bool {
	skip := strings.ToLower(strings.TrimSpace(os.Getenv("DD_E2E_SKIP_WER_DUMPS")))
	for _, truthy := range []string{"1", "true", "yes", "on"} {
		if skip == truthy {
			// The variable asks to skip dumps, so collection is disabled.
			return false
		}
	}
	return true
}
39+
2940
// BaseSuite the base suite for all installer tests on Windows (install script, MSI, exe etc...).
3041
// To run the test suites locally, pick a pipeline and define the following environment variables:
3142
// E2E_PIPELINE_ID: the ID of the pipeline
@@ -111,15 +122,19 @@ func (s *BaseSuite) SetupSuite() {
111122
s.T().Logf("stable agent version: %s", s.StableAgentVersion())
112123

113124
// Enable crash dumps
114-
host := s.Env().RemoteHost
115-
s.dumpFolder = `C:\dumps`
116-
err := windowscommon.EnableWERGlobalDumps(host, s.dumpFolder)
117-
s.Require().NoError(err, "should enable WER dumps")
118-
// Set the environment variable at the machine level.
119-
// The tests will be re-installing services so the per-service environment
120-
// won't be persisted.
121-
_, err = host.Execute(`[Environment]::SetEnvironmentVariable("GOTRACEBACK", "wer", "Machine")`)
122-
s.Require().NoError(err, "should set GOTRACEBACK environment variable")
125+
if isWERDumpCollectionEnabled() {
126+
host := s.Env().RemoteHost
127+
s.dumpFolder = `C:\dumps`
128+
err := windowscommon.EnableWERGlobalDumps(host, s.dumpFolder)
129+
s.Require().NoError(err, "should enable WER dumps")
130+
// Set the environment variable at the machine level.
131+
// The tests will be re-installing services so the per-service environment
132+
// won't be persisted.
133+
_, err = host.Execute(`[Environment]::SetEnvironmentVariable("GOTRACEBACK", "wer", "Machine")`)
134+
s.Require().NoError(err, "should set GOTRACEBACK environment variable")
135+
} else {
136+
s.T().Log("WER dump collection disabled via DD_E2E_SKIP_WER_DUMPS")
137+
}
123138
}
124139

125140
// createCurrentAgent sets the current agent version for the test suite.
@@ -276,14 +291,66 @@ func (s *BaseSuite) AfterTest(suiteName, testName string) {
276291
}
277292

278293
// look for and download crashdumps
279-
dumps, err := windowscommon.DownloadAllWERDumps(s.Env().RemoteHost, s.dumpFolder, s.SessionOutputDir())
280-
s.Assert().NoError(err, "should download crash dumps")
281-
if !s.Assert().Empty(dumps, "should not have crash dumps") {
282-
s.T().Logf("Found crash dumps:")
283-
for _, dump := range dumps {
284-
s.T().Logf(" %s", dump)
294+
if isWERDumpCollectionEnabled() {
295+
// Poll for up to ~30s (1s interval) to allow WER to finish writing full dumps (DumpType=2)
296+
h := s.Env().RemoteHost
297+
deadline := time.Now().Add(30 * time.Second)
298+
for time.Now().Before(deadline) {
299+
entries, _ := h.ReadDir(s.dumpFolder)
300+
hasDump := false
301+
for _, e := range entries {
302+
if !e.IsDir() && strings.HasSuffix(strings.ToLower(e.Name()), ".dmp") {
303+
hasDump = true
304+
break
305+
}
306+
}
307+
if hasDump {
308+
break
309+
}
310+
time.Sleep(1 * time.Second)
311+
}
312+
dumps, err := windowscommon.DownloadAllWERDumps(s.Env().RemoteHost, s.dumpFolder, s.SessionOutputDir())
313+
s.Assert().NoError(err, "should download crash dumps")
314+
if !s.Assert().Empty(dumps, "should not have crash dumps") {
315+
s.T().Logf("Found crash dumps:")
316+
for _, dump := range dumps {
317+
s.T().Logf(" %s", dump)
318+
}
285319
}
286320
}
321+
// Collect WER ReportArchive entries for powershell.exe if present.
322+
// Synchronously scan the WER ReportArchive for PowerShell crash
323+
// reports and copy any found files into the test’s output directory.
324+
func() {
325+
host := s.Env().RemoteHost
326+
base := `C:\ProgramData\Microsoft\Windows\WER\ReportArchive`
327+
entries, derr := host.ReadDir(base)
328+
if derr != nil {
329+
return
330+
}
331+
for _, e := range entries {
332+
if !e.IsDir() {
333+
continue
334+
}
335+
name := e.Name()
336+
if !strings.HasPrefix(strings.ToLower(name), strings.ToLower("AppCrash_powershell.exe")) {
337+
continue
338+
}
339+
dir := filepath.Join(base, name)
340+
files, derr2 := host.ReadDir(dir)
341+
if derr2 != nil {
342+
continue
343+
}
344+
for _, f := range files {
345+
if f.IsDir() {
346+
continue
347+
}
348+
src := filepath.Join(dir, f.Name())
349+
dst := filepath.Join(s.SessionOutputDir(), fmt.Sprintf("WER_%s_%s", name, f.Name()))
350+
_ = host.GetFile(src, dst)
351+
}
352+
}
353+
}()
287354

288355
if s.T().Failed() {
289356
// If the test failed, export the event logs for debugging

test/new-e2e/tests/windows/common/acl.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,40 @@ func objectSecurityDTOFromCommand(host *components.RemoteHost, cmd string) (Obje
359359
// Get the ACL information
360360
output, err := host.Execute(cmd)
361361
if err != nil {
362-
return s, err
362+
// On error, collect diagnostics to aid root cause (e.g., PowerShell crash with clr.dll)
363+
diagScript := fmt.Sprintf(`
364+
$ts = Get-Date -Format 'yyyyMMdd_HHmmss'
365+
$out = "C:\Windows\Temp\acl_diag_$ts.txt"
366+
try {
367+
"=== ACL diagnostics ===" | Out-File -FilePath $out -Encoding UTF8
368+
"Command: %s" | Out-File $out -Append
369+
"PSVersionTable:" | Out-File $out -Append
370+
$PSVersionTable | Out-File $out -Append
371+
"Is64BitProcess: $([Environment]::Is64BitProcess)" | Out-File $out -Append
372+
try {
373+
$clr = Get-Item "$env:WINDIR\Microsoft\NET\Framework64\v4.0.30319\clr.dll"
374+
"CLR64: $($clr.VersionInfo.FileVersion)" | Out-File $out -Append
375+
} catch {}
376+
try {
377+
$clr32 = Get-Item "$env:WINDIR\Microsoft\NET\Framework\v4.0.30319\clr.dll"
378+
"CLR32: $($clr32.VersionInfo.FileVersion)" | Out-File $out -Append
379+
} catch {}
380+
"Privileges (whoami /priv):" | Out-File $out -Append
381+
whoami /priv | Out-File $out -Append
382+
""
383+
"Recent Application errors (.NET Runtime 1023, Application Error 1000):" | Out-File $out -Append
384+
try {
385+
Get-WinEvent -FilterHashtable @{LogName='Application'; Level=1,2} -MaxEvents 30 |
386+
Where-Object { $_.ProviderName -in @('.NET Runtime','Application Error') } |
387+
Select TimeCreated, Id, ProviderName, Message |
388+
Format-List | Out-String -Width 4096 | Out-File $out -Append
389+
} catch {}
390+
} catch {
391+
"Diagnostics collection error: $($_.Exception.Message)" | Out-File $out -Append
392+
}
393+
`, strings.ReplaceAll(cmd, "`", "``"))
394+
_, _ = host.Execute(diagScript)
395+
return s, fmt.Errorf("failed executing ACL command: %v; diagnostics saved under C:\\Windows\\Temp (acl_diag_*\\.txt)", err)
363396
}
364397

365398
err = json.Unmarshal([]byte(output), &s)

0 commit comments

Comments
 (0)