Detect unexpected VM exit and terminate wslcsession.exe

When the VM is killed externally (e.g. hcsdiag /kill, kernel panic), the
wslcsession.exe process now detects it and exits cleanly instead of
hanging indefinitely as a zombie.

Implementation:
- Add GetExitEvent() to IWSLCVirtualMachine IDL. The SYSTEM service
  duplicates HcsVirtualMachine::m_vmExitEvent via COM system_handle
  marshaling so the session process can wait on it.
- WSLCVirtualMachine calls GetExitEvent() during Initialize() and
  exposes VmExitedEvent() for consumers to monitor.
- WSLCSession monitors the exit event via IORelay. On unexpected VM
  exit, OnVmExited() spawns a thread to call Terminate() (must be a
  separate thread to avoid deadlock with IORelay::Stop).
- WSLCSessionManager registers a cleanup callback on HcsVirtualMachine
  to terminate sessions when VM exits. Callback is cleared in
  ~HcsVirtualMachine to avoid firing during normal shutdown.
- Harden Terminate() to skip dockerd signal/wait and unmount when the
  VM is already dead, avoiding unnecessary 30s+ hangs.
- Add tests: VmKillTerminatesSession, VmKillFailsInFlightOperations,
  CleanShutdownStillWorks.
This commit is contained in:
Ben Hillis
2026-04-10 08:45:41 -07:00
parent 251e6cf16f
commit aa851c701c
6 changed files with 222 additions and 18 deletions

View File

@@ -49,7 +49,7 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings)
// Build HCS settings
hcs::ComputeSystem systemSettings{};
systemSettings.Owner = L"WSL";
systemSettings.Owner = std::format(L"WSLC-{}", Settings->DisplayName);
systemSettings.ShouldTerminateOnLastHandleClosed = true;
// Determine which schema version to use based on the Windows version. Windows 10 does not support
@@ -281,6 +281,15 @@ HcsVirtualMachine::~HcsVirtualMachine()
{
std::lock_guard lock(m_lock);
// Clear the session termination callback before destroying the VM.
// During normal shutdown, the session is already terminating — firing the
// callback would cause a redundant (and potentially crashing) COM call.
// Uses its own lock to avoid deadlock with HCS callback thread.
{
std::lock_guard callbackLock(m_sessionTerminationCallbackLock);
m_sessionTerminationCallback = nullptr;
}
// Wait up to 5 seconds for the VM to terminate gracefully.
bool forceTerminate = false;
if (!m_vmExitEvent.wait(5000))
@@ -581,6 +590,15 @@ try
}
CATCH_RETURN()
HRESULT HcsVirtualMachine::RegisterTerminationCallback(_In_ ITerminationCallback* Callback)
try
{
std::lock_guard lock(m_sessionTerminationCallbackLock);
m_sessionTerminationCallback = Callback;
return S_OK;
}
CATCH_RETURN()
void CALLBACK HcsVirtualMachine::OnVmExitCallback(HCS_EVENT* Event, void* Context)
try
{
@@ -630,6 +648,17 @@ void HcsVirtualMachine::OnExit(const HCS_EVENT* Event)
{
LOG_IF_FAILED(m_terminationCallback->OnTermination(reason, Event->EventData));
}
wil::com_ptr<ITerminationCallback> sessionCallback;
{
std::lock_guard lock(m_sessionTerminationCallbackLock);
sessionCallback = m_sessionTerminationCallback;
}
if (sessionCallback)
{
LOG_IF_FAILED(sessionCallback->OnTermination(reason, Event->EventData));
}
}
void HcsVirtualMachine::OnCrash(const HCS_EVENT* Event)

View File

@@ -43,6 +43,7 @@ public:
IFACEMETHOD(DetachDisk)(_In_ ULONG Lun) override;
IFACEMETHOD(AddShare)(_In_ LPCWSTR WindowsPath, _In_ BOOL ReadOnly, _Out_ GUID* ShareId) override;
IFACEMETHOD(RemoveShare)(_In_ REFGUID ShareId) override;
IFACEMETHOD(RegisterTerminationCallback)(_In_ ITerminationCallback* Callback) override;
private:
struct DiskInfo
@@ -97,6 +98,12 @@ private:
bool m_crashLogCaptured = false;
wil::com_ptr<ITerminationCallback> m_terminationCallback;
// Session-side termination callback, registered via RegisterTerminationCallback().
// Guarded by m_sessionTerminationCallbackLock (separate from m_lock to avoid
// deadlock between HCS callback thread and destructor which holds m_lock).
std::mutex m_sessionTerminationCallbackLock;
wil::com_ptr<ITerminationCallback> m_sessionTerminationCallback;
};
} // namespace wsl::windows::service::wslc

View File

@@ -425,6 +425,10 @@ interface IWSLCVirtualMachine : IUnknown
// Removes a previously added filesystem share.
HRESULT RemoveShare([in] REFGUID ShareId);
// Registers a callback to be invoked when the VM exits.
// The callback receives the exit reason and optional details.
HRESULT RegisterTerminationCallback([in] ITerminationCallback* Callback);
}
typedef enum _WSLCSessionStorageFlags

View File

@@ -26,6 +26,28 @@ using wsl::windows::service::wslc::UserHandle;
using wsl::windows::service::wslc::WSLCSession;
using wsl::windows::service::wslc::WSLCVirtualMachine;
// COM callback that signals a local event when the VM terminates.
// Registered with IWSLCVirtualMachine::RegisterTerminationCallback() so the
// SYSTEM service can notify us cross-process when HCS reports VM exit.
struct VmTerminationCallback : winrt::implements<VmTerminationCallback, ITerminationCallback>
{
VmTerminationCallback(HANDLE event)
{
HANDLE dup = nullptr;
THROW_IF_WIN32_BOOL_FALSE(DuplicateHandle(GetCurrentProcess(), event, GetCurrentProcess(), &dup, 0, FALSE, DUPLICATE_SAME_ACCESS));
m_event.reset(dup);
}
HRESULT STDMETHODCALLTYPE OnTermination(WSLCVirtualMachineTerminationReason, LPCWSTR) override
{
m_event.SetEvent();
return S_OK;
}
private:
wil::unique_event m_event;
};
constexpr auto c_containerdStorage = "/var/lib/docker";
namespace {
@@ -289,6 +311,11 @@ try
m_virtualMachine->Initialize();
// Register a COM callback with the SYSTEM service to be notified when the VM exits.
// The callback signals m_vmExitedEvent, which IORelay monitors to trigger OnVmExited().
auto vmTermCallback = winrt::make<VmTerminationCallback>(m_vmExitedEvent.get());
THROW_IF_FAILED(Vm->RegisterTerminationCallback(vmTermCallback.as<ITerminationCallback>().get()));
// Configure storage.
ConfigureStorage(*Settings, tokenInfo->User.Sid);
@@ -306,6 +333,10 @@ try
// Start the event tracker.
m_eventTracker.emplace(m_dockerClient.value(), m_id, m_ioRelay);
// Monitor for unexpected VM exit.
m_ioRelay.AddHandle(
std::make_unique<windows::common::relay::EventHandle>(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this)));
// Recover any existing containers from storage.
RecoverExistingVolumes();
RecoverExistingContainers();
@@ -413,6 +444,30 @@ void WSLCSession::OnDockerdExited()
}
}
void WSLCSession::OnVmExited()
{
if (m_sessionTerminatingEvent.is_signaled())
{
return; // Already shutting down (normal termination path).
}
WSL_LOG(
"UnexpectedVmExit",
TraceLoggingLevel(WINEVENT_LEVEL_WARNING),
TraceLoggingValue(m_id, "SessionId"),
TraceLoggingValue(m_displayName.c_str(), "Name"));
// N.B. This callback runs on the IORelay thread. Terminate() calls m_ioRelay.Stop()
// which joins the IORelay thread, so we must run termination on a separate thread
// to avoid deadlock. Capture a COM reference to prevent the session from being
// destroyed before the thread runs.
Microsoft::WRL::ComPtr<WSLCSession> self(this);
std::thread([self]() {
wsl::windows::common::wslutil::SetThreadDescription(L"VmExitTermination");
LOG_IF_FAILED(self->Terminate());
}).detach();
}
void WSLCSession::OnDockerdLog(const gsl::span<char>& buffer)
try
{
@@ -1792,40 +1847,55 @@ try
m_eventTracker.reset();
m_dockerClient.reset();
// Check if the VM has already exited (e.g., killed externally).
// If so, skip operations that require a live VM to avoid unnecessary waits.
const bool vmDead = m_vmExitedEvent.is_signaled();
// Stop dockerd.
// N.B. dockerd wait a couple seconds if there are any outstanding HTTP request sockets opened.
if (m_dockerdProcess.has_value())
{
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
if (!vmDead)
{
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
int exitCode = -1;
try
{
exitCode = m_dockerdProcess->Wait(30 * 1000);
}
catch (...)
{
LOG_CAUGHT_EXCEPTION();
int exitCode = -1;
try
{
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
exitCode = m_dockerdProcess->Wait(10 * 1000);
exitCode = m_dockerdProcess->Wait(30 * 1000);
}
CATCH_LOG();
catch (...)
{
LOG_CAUGHT_EXCEPTION();
try
{
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
exitCode = m_dockerdProcess->Wait(10 * 1000);
}
CATCH_LOG();
}
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
}
else
{
WSL_LOG("SkippingDockerdShutdown_VmDead");
}
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
m_dockerdProcess.reset();
}
if (m_virtualMachine)
{
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
try
if (!vmDead)
{
m_virtualMachine->Unmount(c_containerdStorage);
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
try
{
m_virtualMachine->Unmount(c_containerdStorage);
}
CATCH_LOG();
}
CATCH_LOG();
m_virtualMachine.reset();
}

View File

@@ -127,6 +127,7 @@ private:
void OnContainerDeleted(const WSLCContainerImpl* Container);
void OnDockerdLog(const gsl::span<char>& Data);
void OnDockerdExited();
void OnVmExited();
void StartDockerd();
void ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, const WSLCHandle ImageHandle);
void RecoverExistingContainers();
@@ -150,6 +151,7 @@ private:
std::unordered_map<std::string, std::unique_ptr<WSLCVhdVolumeImpl>> m_volumes;
std::unordered_set<std::string> m_anonymousVolumes; // TODO: Implement proper anonymous volume support.
wil::unique_event m_sessionTerminatingEvent{wil::EventOptions::ManualReset};
wil::unique_event m_vmExitedEvent{wil::EventOptions::ManualReset};
wil::srwlock m_lock;
IORelay m_ioRelay;
std::optional<ServiceRunningProcess> m_dockerdProcess;

View File

@@ -6740,4 +6740,96 @@ class WSLCTests
ValidateProcessOutput(initProcess, {{1, "OK\n"}});
}
// Kills the VM for a session by finding it via the "WSLC-<name>" owner in hcsdiag output.
// hcsdiag detail line format: " VM, <State>, <GUID>, WSLC-<name>"
static void KillSessionVm(LPCWSTR sessionName)
{
auto ownerTag = std::format(L"WSLC-{}", sessionName);
wsl::windows::common::SubProcess listProc(nullptr, L"hcsdiag.exe list");
auto listOutput = listProc.RunAndCaptureOutput(10000);
auto& output = listOutput.Stdout;
auto ownerPos = output.find(ownerTag);
VERIFY_IS_TRUE(ownerPos != std::wstring::npos);
// The GUID (36 chars) appears before ", WSLC-<name>" in the detail line.
auto guidEnd = output.rfind(L", ", ownerPos);
VERIFY_IS_TRUE(guidEnd != std::wstring::npos && guidEnd >= 36);
auto vmId = output.substr(guidEnd - 36, 36);
VERIFY_IS_TRUE(wsl::shared::string::ToGuid(vmId.c_str()).has_value());
VERIFY_ARE_EQUAL(wsl::windows::common::SubProcess(nullptr, std::format(L"hcsdiag.exe kill {}", vmId).c_str()).Run(10000), 0u);
}
// Waits for a session to terminate (GetState returns terminated or RPC error).
static bool WaitForSessionTermination(IWSLCSession* session, DWORD timeoutSeconds = 30)
{
for (DWORD i = 0; i < timeoutSeconds; i++)
{
Sleep(1000);
WSLCSessionState state{};
auto hr = session->GetState(&state);
if (FAILED(hr) || state == WSLCSessionStateTerminated)
{
return true;
}
}
return false;
}
WSLC_TEST_METHOD(VmKillTerminatesSession)
{
static constexpr auto c_sessionName = L"wslc-vm-kill-test";
auto settings = GetDefaultSessionSettings(c_sessionName);
auto session = CreateSession(settings);
KillSessionVm(c_sessionName);
VERIFY_IS_TRUE(WaitForSessionTermination(session.get()));
}
WSLC_TEST_METHOD(VmKillFailsInFlightOperations)
{
static constexpr auto c_sessionName = L"wslc-vm-kill-inflight-test";
auto settings = GetDefaultSessionSettings(c_sessionName);
auto session = CreateSession(settings);
WSLCProcessLauncher launcher("/bin/sleep", {"/bin/sleep", "99999"});
auto process = launcher.Launch(*session);
KillSessionVm(c_sessionName);
// The process should fail (not hang).
auto exitEvent = process.GetExitEvent();
bool exited = exitEvent.wait(30000);
if (!exited)
{
WSLCProcessState processState{};
int exitCode{};
VERIFY_IS_TRUE(FAILED(process.Get().GetState(&processState, &exitCode)));
}
}
WSLC_TEST_METHOD(CleanShutdownStillWorks)
{
auto settings = GetDefaultSessionSettings(L"wslc-clean-shutdown-test");
auto session = CreateSession(settings);
ExpectCommandResult(session.get(), {"/bin/echo", "hello"}, 0);
auto hr = session->Terminate();
VERIFY_IS_TRUE(SUCCEEDED(hr) || hr == HRESULT_FROM_WIN32(RPC_S_CALL_FAILED));
if (SUCCEEDED(hr))
{
WSLCSessionState state{};
VERIFY_SUCCEEDED(session->GetState(&state));
VERIFY_ARE_EQUAL(state, WSLCSessionStateTerminated);
}
}
};