Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 2cc4767

Browse files
author
Mike McLaughlin
authored
Fix debugger launch race hitting breakpoints in startup code. (#8951) (#9060)
The attached flag was been set asynchronously relative to the DebugActiveProcess returning. This could cause a race where the initial module load notification being missed/not sent to the debugger. This fix sets the attached flag before any notifications sent during launch if the runtime was launched/attached using the startup handshake after dbgshim tells the runtime to "continue" when the runtime startup API callback returns. Also fixes another race condition in dbgshim where EnumerateCLRs returns a NULL continue event handle because the coreclr module was loaded but the g_hContinueStartupEvent wasn't initialized on the runtime side yet. Changed the static initialization of g_hContinueStartupEvent to INVALID_HANDLE_VALUE and the InternalEnumerateCLRs sleep/retry loop to retry when any of the handles are INVALID_HANDLE_VALUE. This fixes the race only when you have the latest dbgshim and coreclr binaries and the old/new mixes still function but don't fix the race.
1 parent 230a62c commit 2cc4767

File tree

3 files changed

+70
-24
lines changed

3 files changed

+70
-24
lines changed

src/debug/ee/debugger.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1895,7 +1895,7 @@ void Debugger::SendCreateProcess(DebuggerLockHolder * pDbgLockHolder)
18951895

18961896
#if defined(FEATURE_CORECLR) && !defined(FEATURE_PAL)
18971897

1898-
HANDLE g_hContinueStartupEvent = NULL;
1898+
HANDLE g_hContinueStartupEvent = INVALID_HANDLE_VALUE;
18991899

19001900
CLR_ENGINE_METRICS g_CLREngineMetrics = {
19011901
sizeof(CLR_ENGINE_METRICS),
@@ -1942,7 +1942,7 @@ void NotifyDebuggerOfTelestoStartup()
19421942
// enumeration of this process will get back a valid continue event
19431943
// the instant we signal the startup notification event.
19441944

1945-
CONSISTENCY_CHECK(NULL == g_hContinueStartupEvent);
1945+
CONSISTENCY_CHECK(INVALID_HANDLE_VALUE == g_hContinueStartupEvent);
19461946
g_hContinueStartupEvent = WszCreateEvent(NULL, TRUE, FALSE, NULL);
19471947
CONSISTENCY_CHECK(INVALID_HANDLE_VALUE != g_hContinueStartupEvent); // we reserve this value for error conditions in EnumerateCLRs
19481948

@@ -2170,7 +2170,14 @@ HRESULT Debugger::Startup(void)
21702170
// Signal the debugger (via dbgshim) and wait until it is ready for us to
21712171
// continue. This needs to be outside the lock and after the transport is
21722172
// initialized.
2173-
PAL_NotifyRuntimeStarted();
2173+
if (PAL_NotifyRuntimeStarted())
2174+
{
2175+
// The runtime was successfully launched and attached so mark it now
2176+
// so no notifications are missed especially the initial module load
2177+
// which would cause debuggers problems with reliable setting breakpoints
2178+
// in startup code or Main.
2179+
MarkDebuggerAttachedInternal();
2180+
}
21742181
#endif // FEATURE_PAL
21752182

21762183
// We don't bother changing this process's permission.

src/dlls/dbgshim/dbgshim.cpp

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -387,25 +387,66 @@ class RuntimeStartupHelper
387387
return hr;
388388
}
389389

390+
bool AreAllHandlesValid(HANDLE *handleArray, DWORD arrayLength)
391+
{
392+
for (int i = 0; i < (int)arrayLength; i++)
393+
{
394+
HANDLE h = handleArray[i];
395+
if (h == INVALID_HANDLE_VALUE)
396+
{
397+
return false;
398+
}
399+
}
400+
return true;
401+
}
402+
390403
HRESULT InternalEnumerateCLRs(HANDLE** ppHandleArray, _In_reads_(*pdwArrayLength) LPWSTR** ppStringArray, DWORD* pdwArrayLength)
391404
{
392405
int numTries = 0;
393406
HRESULT hr;
394407

395408
while (numTries < 25)
396409
{
410+
hr = EnumerateCLRs(m_processId, ppHandleArray, ppStringArray, pdwArrayLength);
411+
397412
// EnumerateCLRs uses the OS API CreateToolhelp32Snapshot which can return ERROR_BAD_LENGTH or
398413
// ERROR_PARTIAL_COPY. If we get either of those, we try wait 1/10th of a second try again (that
399-
// is the recommendation of the OS API owners)
400-
hr = EnumerateCLRs(m_processId, ppHandleArray, ppStringArray, pdwArrayLength);
414+
// is the recommendation of the OS API owners).
401415
if ((hr != HRESULT_FROM_WIN32(ERROR_PARTIAL_COPY)) && (hr != HRESULT_FROM_WIN32(ERROR_BAD_LENGTH)))
402416
{
403-
break;
417+
// Just return any other error or if no handles were found (which means the coreclr module wasn't found yet).
418+
if (FAILED(hr) || *ppHandleArray == NULL || *pdwArrayLength <= 0)
419+
{
420+
return hr;
421+
}
422+
// If EnumerateCLRs succeeded but any of the handles are INVALID_HANDLE_VALUE, then sleep and retry
423+
// also. This fixes a race condition where dbgshim catches the coreclr module just being loaded but
424+
// before g_hContinueStartupEvent has been initialized.
425+
if (AreAllHandlesValid(*ppHandleArray, *pdwArrayLength))
426+
{
427+
return hr;
428+
}
429+
// Clean up memory allocated in EnumerateCLRs since this path it succeeded
430+
CloseCLREnumeration(*ppHandleArray, *ppStringArray, *pdwArrayLength);
431+
432+
*ppHandleArray = NULL;
433+
*ppStringArray = NULL;
434+
*pdwArrayLength = 0;
404435
}
436+
437+
// Sleep and retry enumerating the runtimes
405438
Sleep(100);
406439
numTries++;
440+
441+
if (m_canceled)
442+
{
443+
break;
444+
}
407445
}
408446

447+
// Indicate a timeout
448+
hr = HRESULT_FROM_WIN32(ERROR_TIMEOUT);
449+
409450
return hr;
410451
}
411452

@@ -433,7 +474,7 @@ class RuntimeStartupHelper
433474
DWORD arrayLength = 0;
434475

435476
// Wake up runtime(s)
436-
HRESULT hr = InternalEnumerateCLRs(&handleArray, &stringArray, &arrayLength);
477+
HRESULT hr = EnumerateCLRs(m_processId, &handleArray, &stringArray, &arrayLength);
437478
if (SUCCEEDED(hr))
438479
{
439480
WakeRuntimes(handleArray, arrayLength);
@@ -517,9 +558,8 @@ class RuntimeStartupHelper
517558
bool coreclrExists = false;
518559

519560
HRESULT hr = InvokeStartupCallback(&coreclrExists);
520-
// Because the target process is suspended on create, the toolhelp apis fail with the below errors even
521-
// with the retry logic in InternalEnumerateCLRs.
522-
if (SUCCEEDED(hr) || (hr == HRESULT_FROM_WIN32(ERROR_PARTIAL_COPY)) || (hr == HRESULT_FROM_WIN32(ERROR_BAD_LENGTH)))
561+
// The retry logic in InternalEnumerateCLRs failed if ERROR_TIMEOUT was returned.
562+
if (SUCCEEDED(hr) || (hr == HRESULT_FROM_WIN32(ERROR_TIMEOUT)))
523563
{
524564
if (!coreclrExists && !m_canceled)
525565
{
@@ -531,8 +571,11 @@ class RuntimeStartupHelper
531571
hr = InvokeStartupCallback(&coreclrExists);
532572
if (SUCCEEDED(hr))
533573
{
534-
// We should always find a coreclr module
535-
_ASSERTE(coreclrExists);
574+
// We should always find a coreclr module so fail if we don't
575+
if (!coreclrExists)
576+
{
577+
hr = E_FAIL;
578+
}
536579
}
537580
}
538581
}
@@ -1120,8 +1163,6 @@ EnumerateCLRs(
11201163

11211164
HANDLE hContinueStartupEvent = INVALID_HANDLE_VALUE;
11221165
HRESULT hr = GetContinueStartupEvent(debuggeePID, pStringArray[idx], &hContinueStartupEvent);
1123-
_ASSERTE(SUCCEEDED(hr) == (hContinueStartupEvent != INVALID_HANDLE_VALUE));
1124-
11251166
pEventArray[idx] = hContinueStartupEvent;
11261167
#else
11271168
pEventArray[idx] = NULL;
@@ -1729,7 +1770,7 @@ GetContinueStartupEvent(
17291770
ThrowHR(E_FAIL);
17301771
}
17311772

1732-
if (NULL != continueEvent)
1773+
if (NULL != continueEvent && INVALID_HANDLE_VALUE != continueEvent)
17331774
{
17341775
if (!DuplicateHandle(hProcess, continueEvent, GetCurrentProcess(), &continueEvent,
17351776
EVENT_MODIFY_STATE, FALSE, 0))

src/pal/src/thread/process.cpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1879,7 +1879,7 @@ PAL_UnregisterForRuntimeStartup(
18791879
None
18801880
18811881
Return value:
1882-
TRUE - succeeded, FALSE - failed
1882+
TRUE - successfully launched by debugger, FALSE - not launched or some failure in the handshake
18831883
--*/
18841884
BOOL
18851885
PALAPI
@@ -1889,7 +1889,7 @@ PAL_NotifyRuntimeStarted()
18891889
char continueSemName[CLR_SEM_MAX_NAMELEN];
18901890
sem_t *startupSem = SEM_FAILED;
18911891
sem_t *continueSem = SEM_FAILED;
1892-
BOOL result = TRUE;
1892+
BOOL launched = FALSE;
18931893

18941894
UINT64 processIdDisambiguationKey = 0;
18951895
GetProcessIdDisambiguationKey(gPID, &processIdDisambiguationKey);
@@ -1899,9 +1899,7 @@ PAL_NotifyRuntimeStarted()
18991899

19001900
TRACE("PAL_NotifyRuntimeStarted opening continue '%s' startup '%s'\n", continueSemName, startupSemName);
19011901

1902-
1903-
// Open the debugger startup semaphore. If it doesn't exists, then we do nothing and
1904-
// the function is successful.
1902+
// Open the debugger startup semaphore. If it doesn't exists, then we do nothing and return
19051903
startupSem = sem_open(startupSemName, 0);
19061904
if (startupSem == SEM_FAILED)
19071905
{
@@ -1913,26 +1911,26 @@ PAL_NotifyRuntimeStarted()
19131911
if (continueSem == SEM_FAILED)
19141912
{
19151913
ASSERT("sem_open(%s) failed: %d (%s)\n", continueSemName, errno, strerror(errno));
1916-
result = FALSE;
19171914
goto exit;
19181915
}
19191916

19201917
// Wake up the debugger waiting for startup
19211918
if (sem_post(startupSem) != 0)
19221919
{
19231920
ASSERT("sem_post(startupSem) failed: errno is %d (%s)\n", errno, strerror(errno));
1924-
result = FALSE;
19251921
goto exit;
19261922
}
19271923

19281924
// Now wait until the debugger's runtime startup notification is finished
19291925
if (sem_wait(continueSem) != 0)
19301926
{
19311927
ASSERT("sem_wait(continueSem) failed: errno is %d (%s)\n", errno, strerror(errno));
1932-
result = FALSE;
19331928
goto exit;
19341929
}
19351930

1931+
// Returns that the runtime was successfully launched for debugging
1932+
launched = TRUE;
1933+
19361934
exit:
19371935
if (startupSem != SEM_FAILED)
19381936
{
@@ -1942,7 +1940,7 @@ PAL_NotifyRuntimeStarted()
19421940
{
19431941
sem_close(continueSem);
19441942
}
1945-
return result;
1943+
return launched;
19461944
}
19471945

19481946
/*++

0 commit comments

Comments
 (0)