mirror of
https://github.com/microsoft/WSL.git
synced 2026-04-24 20:30:00 -05:00
Detect unexpected VM exit and terminate wslcsession.exe
When the VM is killed externally (e.g. hcsdiag /kill, kernel panic), the wslcsession.exe process now detects it and exits cleanly instead of hanging indefinitely as a zombie.

Implementation:
- Add GetExitEvent() to IWSLCVirtualMachine IDL. The SYSTEM service duplicates HcsVirtualMachine::m_vmExitEvent via COM system_handle marshaling so the session process can wait on it.
- WSLCVirtualMachine calls GetExitEvent() during Initialize() and exposes VmExitedEvent() for consumers to monitor.
- WSLCSession monitors the exit event via IORelay. On unexpected VM exit, OnVmExited() spawns a thread to call Terminate() (must be a separate thread to avoid deadlock with IORelay::Stop).
- WSLCSessionManager registers a cleanup callback on HcsVirtualMachine to terminate sessions when VM exits. Callback is cleared in ~HcsVirtualMachine to avoid firing during normal shutdown.
- Harden Terminate() to skip dockerd signal/wait and unmount when the VM is already dead, avoiding unnecessary 30s+ hangs.
- Add tests: VmKillTerminatesSession, VmKillFailsInFlightOperations, CleanShutdownStillWorks.
This commit is contained in:
@@ -49,7 +49,7 @@ HcsVirtualMachine::HcsVirtualMachine(_In_ const WSLCSessionSettings* Settings)
|
||||
|
||||
// Build HCS settings
|
||||
hcs::ComputeSystem systemSettings{};
|
||||
systemSettings.Owner = L"WSL";
|
||||
systemSettings.Owner = std::format(L"WSLC-{}", Settings->DisplayName);
|
||||
systemSettings.ShouldTerminateOnLastHandleClosed = true;
|
||||
|
||||
// Determine which schema version to use based on the Windows version. Windows 10 does not support
|
||||
@@ -281,6 +281,15 @@ HcsVirtualMachine::~HcsVirtualMachine()
|
||||
{
|
||||
std::lock_guard lock(m_lock);
|
||||
|
||||
// Clear the session termination callback before destroying the VM.
|
||||
// During normal shutdown, the session is already terminating — firing the
|
||||
// callback would cause a redundant (and potentially crashing) COM call.
|
||||
// Uses its own lock to avoid deadlock with HCS callback thread.
|
||||
{
|
||||
std::lock_guard callbackLock(m_sessionTerminationCallbackLock);
|
||||
m_sessionTerminationCallback = nullptr;
|
||||
}
|
||||
|
||||
// Wait up to 5 seconds for the VM to terminate gracefully.
|
||||
bool forceTerminate = false;
|
||||
if (!m_vmExitEvent.wait(5000))
|
||||
@@ -581,6 +590,15 @@ try
|
||||
}
|
||||
CATCH_RETURN()
|
||||
|
||||
HRESULT HcsVirtualMachine::RegisterTerminationCallback(_In_ ITerminationCallback* Callback)
|
||||
try
|
||||
{
|
||||
std::lock_guard lock(m_sessionTerminationCallbackLock);
|
||||
m_sessionTerminationCallback = Callback;
|
||||
return S_OK;
|
||||
}
|
||||
CATCH_RETURN()
|
||||
|
||||
void CALLBACK HcsVirtualMachine::OnVmExitCallback(HCS_EVENT* Event, void* Context)
|
||||
try
|
||||
{
|
||||
@@ -630,6 +648,17 @@ void HcsVirtualMachine::OnExit(const HCS_EVENT* Event)
|
||||
{
|
||||
LOG_IF_FAILED(m_terminationCallback->OnTermination(reason, Event->EventData));
|
||||
}
|
||||
|
||||
wil::com_ptr<ITerminationCallback> sessionCallback;
|
||||
{
|
||||
std::lock_guard lock(m_sessionTerminationCallbackLock);
|
||||
sessionCallback = m_sessionTerminationCallback;
|
||||
}
|
||||
|
||||
if (sessionCallback)
|
||||
{
|
||||
LOG_IF_FAILED(sessionCallback->OnTermination(reason, Event->EventData));
|
||||
}
|
||||
}
|
||||
|
||||
void HcsVirtualMachine::OnCrash(const HCS_EVENT* Event)
|
||||
|
||||
@@ -43,6 +43,7 @@ public:
|
||||
IFACEMETHOD(DetachDisk)(_In_ ULONG Lun) override;
|
||||
IFACEMETHOD(AddShare)(_In_ LPCWSTR WindowsPath, _In_ BOOL ReadOnly, _Out_ GUID* ShareId) override;
|
||||
IFACEMETHOD(RemoveShare)(_In_ REFGUID ShareId) override;
|
||||
IFACEMETHOD(RegisterTerminationCallback)(_In_ ITerminationCallback* Callback) override;
|
||||
|
||||
private:
|
||||
struct DiskInfo
|
||||
@@ -97,6 +98,12 @@ private:
|
||||
bool m_crashLogCaptured = false;
|
||||
|
||||
wil::com_ptr<ITerminationCallback> m_terminationCallback;
|
||||
|
||||
// Session-side termination callback, registered via RegisterTerminationCallback().
|
||||
// Guarded by m_sessionTerminationCallbackLock (separate from m_lock to avoid
|
||||
// deadlock between HCS callback thread and destructor which holds m_lock).
|
||||
std::mutex m_sessionTerminationCallbackLock;
|
||||
wil::com_ptr<ITerminationCallback> m_sessionTerminationCallback;
|
||||
};
|
||||
|
||||
} // namespace wsl::windows::service::wslc
|
||||
|
||||
@@ -425,6 +425,10 @@ interface IWSLCVirtualMachine : IUnknown
|
||||
|
||||
// Removes a previously added filesystem share.
|
||||
HRESULT RemoveShare([in] REFGUID ShareId);
|
||||
|
||||
// Registers a callback to be invoked when the VM exits.
|
||||
// The callback receives the exit reason and optional details.
|
||||
HRESULT RegisterTerminationCallback([in] ITerminationCallback* Callback);
|
||||
}
|
||||
|
||||
typedef enum _WSLCSessionStorageFlags
|
||||
|
||||
@@ -26,6 +26,28 @@ using wsl::windows::service::wslc::UserHandle;
|
||||
using wsl::windows::service::wslc::WSLCSession;
|
||||
using wsl::windows::service::wslc::WSLCVirtualMachine;
|
||||
|
||||
// COM callback that signals a local event when the VM terminates.
|
||||
// Registered with IWSLCVirtualMachine::RegisterTerminationCallback() so the
|
||||
// SYSTEM service can notify us cross-process when HCS reports VM exit.
|
||||
struct VmTerminationCallback : winrt::implements<VmTerminationCallback, ITerminationCallback>
|
||||
{
|
||||
VmTerminationCallback(HANDLE event)
|
||||
{
|
||||
HANDLE dup = nullptr;
|
||||
THROW_IF_WIN32_BOOL_FALSE(DuplicateHandle(GetCurrentProcess(), event, GetCurrentProcess(), &dup, 0, FALSE, DUPLICATE_SAME_ACCESS));
|
||||
m_event.reset(dup);
|
||||
}
|
||||
|
||||
HRESULT STDMETHODCALLTYPE OnTermination(WSLCVirtualMachineTerminationReason, LPCWSTR) override
|
||||
{
|
||||
m_event.SetEvent();
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
private:
|
||||
wil::unique_event m_event;
|
||||
};
|
||||
|
||||
constexpr auto c_containerdStorage = "/var/lib/docker";
|
||||
|
||||
namespace {
|
||||
@@ -289,6 +311,11 @@ try
|
||||
|
||||
m_virtualMachine->Initialize();
|
||||
|
||||
// Register a COM callback with the SYSTEM service to be notified when the VM exits.
|
||||
// The callback signals m_vmExitedEvent, which IORelay monitors to trigger OnVmExited().
|
||||
auto vmTermCallback = winrt::make<VmTerminationCallback>(m_vmExitedEvent.get());
|
||||
THROW_IF_FAILED(Vm->RegisterTerminationCallback(vmTermCallback.as<ITerminationCallback>().get()));
|
||||
|
||||
// Configure storage.
|
||||
ConfigureStorage(*Settings, tokenInfo->User.Sid);
|
||||
|
||||
@@ -306,6 +333,10 @@ try
|
||||
// Start the event tracker.
|
||||
m_eventTracker.emplace(m_dockerClient.value(), m_id, m_ioRelay);
|
||||
|
||||
// Monitor for unexpected VM exit.
|
||||
m_ioRelay.AddHandle(
|
||||
std::make_unique<windows::common::relay::EventHandle>(m_vmExitedEvent.get(), std::bind(&WSLCSession::OnVmExited, this)));
|
||||
|
||||
// Recover any existing containers from storage.
|
||||
RecoverExistingVolumes();
|
||||
RecoverExistingContainers();
|
||||
@@ -413,6 +444,30 @@ void WSLCSession::OnDockerdExited()
|
||||
}
|
||||
}
|
||||
|
||||
void WSLCSession::OnVmExited()
|
||||
{
|
||||
if (m_sessionTerminatingEvent.is_signaled())
|
||||
{
|
||||
return; // Already shutting down (normal termination path).
|
||||
}
|
||||
|
||||
WSL_LOG(
|
||||
"UnexpectedVmExit",
|
||||
TraceLoggingLevel(WINEVENT_LEVEL_WARNING),
|
||||
TraceLoggingValue(m_id, "SessionId"),
|
||||
TraceLoggingValue(m_displayName.c_str(), "Name"));
|
||||
|
||||
// N.B. This callback runs on the IORelay thread. Terminate() calls m_ioRelay.Stop()
|
||||
// which joins the IORelay thread, so we must run termination on a separate thread
|
||||
// to avoid deadlock. Capture a COM reference to prevent the session from being
|
||||
// destroyed before the thread runs.
|
||||
Microsoft::WRL::ComPtr<WSLCSession> self(this);
|
||||
std::thread([self]() {
|
||||
wsl::windows::common::wslutil::SetThreadDescription(L"VmExitTermination");
|
||||
LOG_IF_FAILED(self->Terminate());
|
||||
}).detach();
|
||||
}
|
||||
|
||||
void WSLCSession::OnDockerdLog(const gsl::span<char>& buffer)
|
||||
try
|
||||
{
|
||||
@@ -1792,40 +1847,55 @@ try
|
||||
m_eventTracker.reset();
|
||||
m_dockerClient.reset();
|
||||
|
||||
// Check if the VM has already exited (e.g., killed externally).
|
||||
// If so, skip operations that require a live VM to avoid unnecessary waits.
|
||||
const bool vmDead = m_vmExitedEvent.is_signaled();
|
||||
|
||||
// Stop dockerd.
|
||||
// N.B. dockerd wait a couple seconds if there are any outstanding HTTP request sockets opened.
|
||||
if (m_dockerdProcess.has_value())
|
||||
{
|
||||
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
|
||||
if (!vmDead)
|
||||
{
|
||||
LOG_IF_FAILED(m_dockerdProcess->Get().Signal(WSLCSignalSIGTERM));
|
||||
|
||||
int exitCode = -1;
|
||||
try
|
||||
{
|
||||
exitCode = m_dockerdProcess->Wait(30 * 1000);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
LOG_CAUGHT_EXCEPTION();
|
||||
int exitCode = -1;
|
||||
try
|
||||
{
|
||||
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
|
||||
exitCode = m_dockerdProcess->Wait(10 * 1000);
|
||||
exitCode = m_dockerdProcess->Wait(30 * 1000);
|
||||
}
|
||||
CATCH_LOG();
|
||||
catch (...)
|
||||
{
|
||||
LOG_CAUGHT_EXCEPTION();
|
||||
try
|
||||
{
|
||||
m_dockerdProcess->Get().Signal(WSLCSignalSIGKILL);
|
||||
exitCode = m_dockerdProcess->Wait(10 * 1000);
|
||||
}
|
||||
CATCH_LOG();
|
||||
}
|
||||
|
||||
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
|
||||
}
|
||||
else
|
||||
{
|
||||
WSL_LOG("SkippingDockerdShutdown_VmDead");
|
||||
}
|
||||
|
||||
WSL_LOG("DockerdExit", TraceLoggingValue(exitCode, "code"));
|
||||
m_dockerdProcess.reset();
|
||||
}
|
||||
|
||||
if (m_virtualMachine)
|
||||
{
|
||||
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
|
||||
try
|
||||
if (!vmDead)
|
||||
{
|
||||
m_virtualMachine->Unmount(c_containerdStorage);
|
||||
// N.B. dockerd has exited by this point, so unmounting the VHD is safe since no container can be running.
|
||||
try
|
||||
{
|
||||
m_virtualMachine->Unmount(c_containerdStorage);
|
||||
}
|
||||
CATCH_LOG();
|
||||
}
|
||||
CATCH_LOG();
|
||||
|
||||
m_virtualMachine.reset();
|
||||
}
|
||||
|
||||
@@ -127,6 +127,7 @@ private:
|
||||
void OnContainerDeleted(const WSLCContainerImpl* Container);
|
||||
void OnDockerdLog(const gsl::span<char>& Data);
|
||||
void OnDockerdExited();
|
||||
void OnVmExited();
|
||||
void StartDockerd();
|
||||
void ImportImageImpl(DockerHTTPClient::HTTPRequestContext& Request, const WSLCHandle ImageHandle);
|
||||
void RecoverExistingContainers();
|
||||
@@ -150,6 +151,7 @@ private:
|
||||
std::unordered_map<std::string, std::unique_ptr<WSLCVhdVolumeImpl>> m_volumes;
|
||||
std::unordered_set<std::string> m_anonymousVolumes; // TODO: Implement proper anonymous volume support.
|
||||
wil::unique_event m_sessionTerminatingEvent{wil::EventOptions::ManualReset};
|
||||
wil::unique_event m_vmExitedEvent{wil::EventOptions::ManualReset};
|
||||
wil::srwlock m_lock;
|
||||
IORelay m_ioRelay;
|
||||
std::optional<ServiceRunningProcess> m_dockerdProcess;
|
||||
|
||||
@@ -6740,4 +6740,96 @@ class WSLCTests
|
||||
|
||||
ValidateProcessOutput(initProcess, {{1, "OK\n"}});
|
||||
}
|
||||
|
||||
// Force-kills the VM that backs the session named `sessionName`.
// The VM is located by searching the output of `hcsdiag.exe list` for the
// "WSLC-<name>" owner tag. The detail line is expected to have the form:
//   " VM, <State>, <GUID>, WSLC-<name>"
// Fails the test if the owner tag is missing, the GUID cannot be parsed, or
// `hcsdiag.exe kill` exits with a non-zero code.
static void KillSessionVm(LPCWSTR sessionName)
{
    // Length of a GUID in its textual 8-4-4-4-12 form.
    constexpr size_t c_guidLength = 36;

    const auto ownerTag = std::format(L"WSLC-{}", sessionName);

    wsl::windows::common::SubProcess listProc(nullptr, L"hcsdiag.exe list");
    auto listing = listProc.RunAndCaptureOutput(10000);
    const auto& text = listing.Stdout;

    const auto tagOffset = text.find(ownerTag);
    VERIFY_IS_TRUE(tagOffset != std::wstring::npos);

    // The GUID sits immediately before the ", " separator that precedes the owner tag.
    const auto separatorOffset = text.rfind(L", ", tagOffset);
    VERIFY_IS_TRUE(separatorOffset != std::wstring::npos && separatorOffset >= c_guidLength);

    const auto vmId = text.substr(separatorOffset - c_guidLength, c_guidLength);
    VERIFY_IS_TRUE(wsl::shared::string::ToGuid(vmId.c_str()).has_value());

    const auto killCommand = std::format(L"hcsdiag.exe kill {}", vmId);
    wsl::windows::common::SubProcess killProc(nullptr, killCommand.c_str());
    VERIFY_ARE_EQUAL(killProc.Run(10000), 0u);
}
|
||||
|
||||
// Waits for a session to terminate (GetState returns terminated or RPC error).
|
||||
static bool WaitForSessionTermination(IWSLCSession* session, DWORD timeoutSeconds = 30)
|
||||
{
|
||||
for (DWORD i = 0; i < timeoutSeconds; i++)
|
||||
{
|
||||
Sleep(1000);
|
||||
|
||||
WSLCSessionState state{};
|
||||
auto hr = session->GetState(&state);
|
||||
if (FAILED(hr) || state == WSLCSessionStateTerminated)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verifies that killing the VM of a live session leads to the session being observed as terminated.
WSLC_TEST_METHOD(VmKillTerminatesSession)
{
    static constexpr const wchar_t* c_vmKillSessionName = L"wslc-vm-kill-test";

    auto sessionSettings = GetDefaultSessionSettings(c_vmKillSessionName);
    auto liveSession = CreateSession(sessionSettings);

    // Kill the VM from outside the session, then wait for the session to be reported as terminated.
    KillSessionVm(c_vmKillSessionName);

    const bool sessionTerminated = WaitForSessionTermination(liveSession.get());
    VERIFY_IS_TRUE(sessionTerminated);
}
|
||||
|
||||
// Verifies that a process still running when the VM is killed either signals its
// exit event or fails state queries, instead of hanging.
WSLC_TEST_METHOD(VmKillFailsInFlightOperations)
{
    static constexpr const wchar_t* c_inflightSessionName = L"wslc-vm-kill-inflight-test";

    auto sessionSettings = GetDefaultSessionSettings(c_inflightSessionName);
    auto liveSession = CreateSession(sessionSettings);

    // Start a long-lived process so that something is in flight when the VM is killed.
    WSLCProcessLauncher sleepLauncher("/bin/sleep", {"/bin/sleep", "99999"});
    auto sleeper = sleepLauncher.Launch(*liveSession);

    KillSessionVm(c_inflightSessionName);

    // Either the exit event is signaled within 30 seconds, or querying the process state must fail.
    auto processExitEvent = sleeper.GetExitEvent();
    if (!processExitEvent.wait(30000))
    {
        WSLCProcessState queriedState{};
        int queriedExitCode{};
        VERIFY_IS_TRUE(FAILED(sleeper.Get().GetState(&queriedState, &queriedExitCode)));
    }
}
|
||||
|
||||
// Verifies that an explicit Terminate() on a healthy session still behaves as expected.
WSLC_TEST_METHOD(CleanShutdownStillWorks)
{
    auto sessionSettings = GetDefaultSessionSettings(L"wslc-clean-shutdown-test");
    auto liveSession = CreateSession(sessionSettings);

    // Run a trivial command first to confirm the session is functional.
    ExpectCommandResult(liveSession.get(), {"/bin/echo", "hello"}, 0);

    // Terminate() is allowed to succeed or to fail with RPC_S_CALL_FAILED.
    const HRESULT terminateResult = liveSession->Terminate();
    const bool terminateSucceeded = SUCCEEDED(terminateResult);
    VERIFY_IS_TRUE(terminateSucceeded || terminateResult == HRESULT_FROM_WIN32(RPC_S_CALL_FAILED));

    // Only when Terminate() succeeded is the session queried for its final state.
    if (terminateSucceeded)
    {
        WSLCSessionState finalState{};
        VERIFY_SUCCEEDED(liveSession->GetState(&finalState));
        VERIFY_ARE_EQUAL(finalState, WSLCSessionStateTerminated);
    }
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user