terminal/src/renderer/atlas/BackendD3D.cpp
Leonard Hecker 324e0f425a
Move colorbrewer into its own proper header (#17444)
I'm planning to use the `dark2` color palette in the upcoming
cooked read rewrite as a debug aid to paint dirty regions.
Now that it's going to be used in more than one place I figured
it may be time to properly add it to the NOTICE file even if
it still won't be shipped with the final product.
2024-06-20 16:55:13 +00:00

2343 lines
97 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "pch.h"
#include "BackendD3D.h"
#include <til/unicode.h>
#include <custom_shader_ps.h>
#include <custom_shader_vs.h>
#include <shader_ps.h>
#include <shader_vs.h>
#include "BuiltinGlyphs.h"
#include "dwrite.h"
#include "wic.h"
#include "../../types/inc/ColorFix.hpp"
#if ATLAS_DEBUG_SHOW_DIRTY || ATLAS_DEBUG_COLORIZE_GLYPH_ATLAS
#include <til/colorbrewer.h>
#endif
TIL_FAST_MATH_BEGIN
#pragma warning(disable : 4100) // '...': unreferenced formal parameter
#pragma warning(disable : 26440) // Function '...' can be declared 'noexcept'(f.6).
// This code packs various data into smaller-than-int types to save both CPU and GPU memory. This warning would force
// us to add dozens upon dozens of gsl::narrow_cast<>s throughout the file which is more annoying than helpful.
#pragma warning(disable : 4242) // '=': conversion from '...' to '...', possible loss of data
#pragma warning(disable : 4244) // 'initializing': conversion from '...' to '...', possible loss of data
#pragma warning(disable : 4267) // 'argument': conversion from '...' to '...', possible loss of data
#pragma warning(disable : 4838) // conversion from '...' to '...' requires a narrowing conversion
#pragma warning(disable : 26472) // Don't use a static_cast for arithmetic conversions. Use brace initialization, gsl::narrow_cast or gsl::narrow (type.1).
// Disable a bunch of warnings which get in the way of writing performant code.
#pragma warning(disable : 26429) // Symbol 'data' is never tested for nullness, it can be marked as not_null (f.23).
#pragma warning(disable : 26446) // Prefer to use gsl::at() instead of unchecked subscript operator (bounds.4).
#pragma warning(disable : 26459) // You called an STL function '...' with a raw pointer parameter at position '...' that may be unsafe [...].
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
#pragma warning(disable : 26482) // Only index into arrays using constant expressions (bounds.2).
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1).
// Initializing large arrays can be very costly compared to how cheap some of these functions are.
#define ALLOW_UNINITIALIZED_BEGIN _Pragma("warning(push)") _Pragma("warning(disable : 26494)")
#define ALLOW_UNINITIALIZED_END _Pragma("warning(pop)")
using namespace Microsoft::Console::Render::Atlas;
// Direct2D transform with unit scale on the diagonal (m11, m22); every other member is zero-initialized.
static constexpr D2D1_MATRIX_3X2_F identityTransform{ .m11 = 1, .m22 = 1 };
// Color with all four components set to 1. Used as the initial color of the solid color brushes created in this file.
static constexpr D2D1_COLOR_F whiteColor{ 1, 1, 1, 1 };
// Returns the value reported by QueryPerformanceFrequency() as an unsigned 64-bit integer.
static u64 queryPerfFreq() noexcept
{
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    // QuadPart is signed; the bits are reinterpreted as-is rather than converted.
    const auto ticksPerSecond = std::bit_cast<u64>(frequency.QuadPart);
    return ticksPerSecond;
}
// Returns the current value reported by QueryPerformanceCounter() as an unsigned 64-bit integer.
static u64 queryPerfCount() noexcept
{
    LARGE_INTEGER counter;
    QueryPerformanceCounter(&counter);
    // QuadPart is signed; the bits are reinterpreted as-is rather than converted.
    const auto ticks = std::bit_cast<u64>(counter.QuadPart);
    return ticks;
}
// Creates the device objects that do not depend on any settings: the built-in vertex and pixel shader,
// the input layout, the quad vertex/index buffers, the two constant buffers and the blend state.
// Every failing D3D call throws via THROW_IF_FAILED.
// With ATLAS_DEBUG_SHADER_HOT_RELOAD enabled it additionally starts watching the source directory for .hlsl changes.
BackendD3D::BackendD3D(const RenderingPayload& p)
{
// The precompiled shader bytecode comes from the shader_vs.h / shader_ps.h headers included above.
THROW_IF_FAILED(p.device->CreateVertexShader(&shader_vs[0], sizeof(shader_vs), nullptr, _vertexShader.addressof()));
THROW_IF_FAILED(p.device->CreatePixelShader(&shader_ps[0], sizeof(shader_ps), nullptr, _pixelShader.addressof()));
{
// Input slot 0 carries the per-vertex quad corners, input slot 1 carries
// one QuadInstance per drawn quad (instance data step rate of 1).
static constexpr D3D11_INPUT_ELEMENT_DESC layout[]{
{ "SV_Position", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0 },
{ "shadingType", 0, DXGI_FORMAT_R16_UINT, 1, offsetof(QuadInstance, shadingType), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
{ "renditionScale", 0, DXGI_FORMAT_R8G8_UINT, 1, offsetof(QuadInstance, renditionScale), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
{ "position", 0, DXGI_FORMAT_R16G16_SINT, 1, offsetof(QuadInstance, position), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
{ "size", 0, DXGI_FORMAT_R16G16_UINT, 1, offsetof(QuadInstance, size), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
{ "texcoord", 0, DXGI_FORMAT_R16G16_UINT, 1, offsetof(QuadInstance, texcoord), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
{ "color", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 1, offsetof(QuadInstance, color), D3D11_INPUT_PER_INSTANCE_DATA, 1 },
};
THROW_IF_FAILED(p.device->CreateInputLayout(&layout[0], std::size(layout), &shader_vs[0], sizeof(shader_vs), _inputLayout.addressof()));
}
{
// The 4 corners of a unit quad.
static constexpr f32x2 vertices[]{
{ 0, 0 },
{ 1, 0 },
{ 1, 1 },
{ 0, 1 },
};
static constexpr D3D11_SUBRESOURCE_DATA initialData{ &vertices[0] };
D3D11_BUFFER_DESC desc{};
desc.ByteWidth = sizeof(vertices);
desc.Usage = D3D11_USAGE_IMMUTABLE;
desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
THROW_IF_FAILED(p.device->CreateBuffer(&desc, &initialData, _vertexBuffer.addressof()));
}
{
// Two triangles (6 indices) that together cover the unit quad above.
static constexpr u16 indices[]{
0, // { 0, 0 }
1, // { 1, 0 }
2, // { 1, 1 }
2, // { 1, 1 }
3, // { 0, 1 }
0, // { 0, 0 }
};
static constexpr D3D11_SUBRESOURCE_DATA initialData{ &indices[0] };
D3D11_BUFFER_DESC desc{};
desc.ByteWidth = sizeof(indices);
desc.Usage = D3D11_USAGE_IMMUTABLE;
desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
THROW_IF_FAILED(p.device->CreateBuffer(&desc, &initialData, _indexBuffer.addressof()));
}
{
// Created without initial data. The contents are uploaded in _recreateConstBuffer().
static constexpr D3D11_BUFFER_DESC desc{
.ByteWidth = sizeof(VSConstBuffer),
.Usage = D3D11_USAGE_DEFAULT,
.BindFlags = D3D11_BIND_CONSTANT_BUFFER,
};
THROW_IF_FAILED(p.device->CreateBuffer(&desc, nullptr, _vsConstantBuffer.addressof()));
}
{
// Created without initial data. The contents are uploaded in _recreateConstBuffer().
static constexpr D3D11_BUFFER_DESC desc{
.ByteWidth = sizeof(PSConstBuffer),
.Usage = D3D11_USAGE_DEFAULT,
.BindFlags = D3D11_BIND_CONSTANT_BUFFER,
};
THROW_IF_FAILED(p.device->CreateBuffer(&desc, nullptr, _psConstantBuffer.addressof()));
}
{
// The final step of the ClearType blending algorithm is a lerp() between the premultiplied alpha
// background color and straight alpha foreground color given the 3 RGB weights in alphaCorrected:
//   lerp(background, foreground, weights)
// Which is equivalent to:
//   background * (1 - weights) + foreground * weights
//
// This COULD be implemented using dual source color blending like so:
//   .SrcBlend = D3D11_BLEND_SRC1_COLOR
//   .DestBlend = D3D11_BLEND_INV_SRC1_COLOR
//   .BlendOp = D3D11_BLEND_OP_ADD
// Because:
//   background * (1 - weights) + foreground * weights
//       ^             ^        ^     ^           ^
//      Dest     INV_SRC1_COLOR |    Src      SRC1_COLOR
//                            OP_ADD
//
// BUT we need simultaneous support for regular "source over" alpha blending
// (SHADING_TYPE_PASSTHROUGH) like this:
//   background * (1 - alpha) + foreground
//
// This is why we set:
//   .SrcBlend = D3D11_BLEND_ONE
//
// --> We need to multiply the foreground with the weights ourselves.
static constexpr D3D11_BLEND_DESC desc{
.RenderTarget = { {
.BlendEnable = TRUE,
.SrcBlend = D3D11_BLEND_ONE,
.DestBlend = D3D11_BLEND_INV_SRC1_COLOR,
.BlendOp = D3D11_BLEND_OP_ADD,
.SrcBlendAlpha = D3D11_BLEND_ONE,
.DestBlendAlpha = D3D11_BLEND_INV_SRC1_ALPHA,
.BlendOpAlpha = D3D11_BLEND_OP_ADD,
.RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL,
} },
};
THROW_IF_FAILED(p.device->CreateBlendState(&desc, _blendState.addressof()));
}
#if ATLAS_DEBUG_SHADER_HOT_RELOAD
// Watch the directory of this source file (non-recursively) for changes to .hlsl files.
_sourceDirectory = std::filesystem::path{ __FILE__ }.parent_path();
_sourceCodeWatcher = wil::make_folder_change_reader_nothrow(_sourceDirectory.c_str(), false, wil::FolderChangeEvents::FileName | wil::FolderChangeEvents::LastWriteTime, [this](wil::FolderChangeEvent, PCWSTR path) {
if (til::ends_with(path, L".hlsl"))
{
// INT64_MAX means "no reload pending". The compare-exchange only schedules a reload (100ms from now)
// if none is pending yet, so further change events don't push an already scheduled reload back.
// _debugUpdateShaders() picks the timestamp up once it has passed.
auto expected = INT64_MAX;
const auto invalidationTime = std::chrono::steady_clock::now() + std::chrono::milliseconds(100);
_sourceCodeInvalidationTime.compare_exchange_strong(expected, invalidationTime.time_since_epoch().count(), std::memory_order_relaxed);
}
});
#endif
}
#pragma warning(suppress : 26432) // If you define or delete any default operation in the type '...', define or delete them all (c.21).
BackendD3D::~BackendD3D()
{
    // No Direct2D batch is open --> nothing to finish.
    if (!_d2dBeganDrawing)
    {
        return;
    }

    // An exception thrown somewhere between BeginDraw() and EndDraw() leaves the batch open.
    // Technically it still has to be closed with EndDraw() before any resources get released.
    // A failure is only logged, because there is nothing else a destructor could do about it.
#pragma warning(suppress : 26447) // The function is declared 'noexcept' but calls function '...' which may throw exceptions (f.6).
    LOG_IF_FAILED(_d2dRenderTarget->EndDraw());
}
// Drops both render target views held by this backend and marks the
// cached settings as outdated so that the next Render() call rebuilds them.
void BackendD3D::ReleaseResources() noexcept
{
_renderTargetView.reset();
_customRenderTargetView.reset();
// Resetting the generation makes it differ from p.s.generation() on the next Render() call,
// which ensures that _handleSettingsUpdate() is called and _renderTargetView gets recreated.
_generation = {};
}
// Renders one frame: applies pending settings changes, (re)binds the render target,
// draws background, cursor background, text and selection as quads, flushes them to
// the GPU and finally runs the custom pixel shader pass if one is active.
void BackendD3D::Render(RenderingPayload& p)
{
    // A generation mismatch means that some setting changed since the last frame
    // (or that ReleaseResources() reset our cached generation).
    const auto settingsOutdated = _generation != p.s.generation();
    if (settingsOutdated)
    {
        _handleSettingsUpdate(p);
    }

    _debugUpdateShaders(p);

    // After a Present() the render target becomes unbound, so it has to be set again every frame.
    // With a custom shader we first draw into the offscreen target instead of the swap chain.
    {
        const auto renderTarget = _customRenderTargetView ? _customRenderTargetView.addressof() : _renderTargetView.addressof();
        p.deviceContext->OMSetRenderTargets(1, renderTarget, nullptr);
    }

    // Invalidating the render target helps with spotting invalid quad instances and Present1() bugs.
#if ATLAS_DEBUG_SHOW_DIRTY || ATLAS_DEBUG_DUMP_RENDER_TARGET
    {
        static constexpr f32 clearColor[4]{};
        p.deviceContext->ClearView(_renderTargetView.get(), &clearColor[0], nullptr, 0);
    }
#endif

    // The individual passes, in back-to-front order.
    _drawBackground(p);
    _drawCursorBackground(p);
    _drawText(p);
    _drawSelection(p);
    _debugShowDirty(p);
    _flushQuads(p);

    if (_customPixelShader)
    {
        _executeCustomShader(p);
    }

    _debugDumpRenderTarget(p);
}
// Returns the flag computed by _recreateCustomShader(): it is set when the custom pixel shader
// uses its time variable, or when that could not be determined via shader reflection.
bool BackendD3D::RequiresContinuousRedraw() noexcept
{
return _requiresContinuousRedraw;
}
void BackendD3D::_handleSettingsUpdate(const RenderingPayload& p)
{
if (!_renderTargetView)
{
wil::com_ptr<ID3D11Texture2D> buffer;
THROW_IF_FAILED(p.swapChain.swapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast<void**>(buffer.addressof())));
THROW_IF_FAILED(p.device->CreateRenderTargetView(buffer.get(), nullptr, _renderTargetView.put()));
}
const auto fontChanged = _fontGeneration != p.s->font.generation();
const auto miscChanged = _miscGeneration != p.s->misc.generation();
const auto cellCountChanged = _viewportCellCount != p.s->viewportCellCount;
if (fontChanged)
{
_updateFontDependents(p);
}
if (miscChanged)
{
_recreateCustomShader(p);
}
if (cellCountChanged)
{
_recreateBackgroundColorBitmap(p);
}
// Similar to _renderTargetView above, we might have to recreate the _customRenderTargetView whenever _swapChainManager
// resets it. We only do it after calling _recreateCustomShader however, since that sets the _customPixelShader.
if (_customPixelShader && !_customRenderTargetView)
{
_recreateCustomRenderTargetView(p);
}
_recreateConstBuffer(p);
_setupDeviceContextState(p);
_generation = p.s.generation();
_fontGeneration = p.s->font.generation();
_miscGeneration = p.s->misc.generation();
_targetSize = p.s->targetSize;
_viewportCellCount = p.s->viewportCellCount;
}
// Recomputes everything that is derived from the font settings: the curly underline geometry,
// the DirectWrite rendering parameters, the text shading type and the ligature overhang thresholds.
// It also schedules a glyph atlas reset and drops the soft font bitmap.
void BackendD3D::_updateFontDependents(const RenderingPayload& p)
{
    const auto& font = *p.s->font;

    // Curlyline is drawn with a desired height relative to the font size. The
    // baseline of curlyline is at the middle of singly underline. When there's
    // limited space to draw a curlyline, we apply a limit on the peak height.
    {
        // These are deliberately plain (signed) ints so that the subtractions below can't wrap around.
        const int cellHeight = font.cellSize.y;
        const int doubleTop = font.doubleUnderline[0].position;
        const int doubleBottom = font.doubleUnderline[1].position;
        const int strokeHeight = font.doubleUnderline[0].height;

        // This gives it the same position and height as our double-underline. There's no particular reason for that, apart from
        // it being simple to implement and robust against more peculiar fonts with unusually large/small descenders, etc.
        // We still need to ensure though that it doesn't clip out of the cellHeight at the bottom, which is why the top has a min().
        const auto curlyHeight = std::max(3, doubleBottom + strokeHeight - doubleTop);
        const auto curlyTop = std::min(doubleTop, cellHeight - curlyHeight - strokeHeight);

        _curlyUnderline.position = gsl::narrow_cast<u16>(curlyTop);
        _curlyUnderline.height = gsl::narrow_cast<u16>(curlyHeight);
        _curlyLineHalfHeight = curlyHeight * 0.5f;
    }

    DWrite_GetRenderParams(p.dwriteFactory.get(), &_gamma, &_cleartypeEnhancedContrast, &_grayscaleEnhancedContrast, _textRenderingParams.put());

    // Clearing the atlas requires BeginDraw(), which is expensive. Defer this until we need Direct2D anyways.
    _fontChangedResetGlyphAtlas = true;

    const auto usesClearType = font.antialiasingMode == AntialiasingMode::ClearType;
    _textShadingType = usesClearType ? ShadingType::TextClearType : ShadingType::TextGrayscale;

    // _ligatureOverhangTriggerLeft/Right are essentially thresholds for a glyph's width at
    // which point we consider it wider than allowed and "this looks like a coding ligature".
    // See _drawTextOverlapSplit for more information about what this does.
    {
        // Ligatures count as enabled unless the first standard ligatures
        // feature entry in the font features explicitly turns them off.
        auto ligaturesEnabled = true;
        for (const auto& fontFeature : font.fontFeatures)
        {
            if (fontFeature.nameTag != DWRITE_FONT_FEATURE_TAG_STANDARD_LIGATURES)
            {
                continue;
            }
            ligaturesEnabled = fontFeature.parameter != 0;
            break;
        }

        if (ligaturesEnabled)
        {
            const auto halfCellWidth = font.cellSize.x / 2;
            _ligatureOverhangTriggerLeft = -halfCellWidth;
            _ligatureOverhangTriggerRight = font.advanceWidth + halfCellWidth;
        }
        else
        {
            // No ligatures -> No thresholds.
            _ligatureOverhangTriggerLeft = til::CoordTypeMin;
            _ligatureOverhangTriggerRight = til::CoordTypeMax;
        }
    }

    // If the Direct2D render target doesn't exist yet, _resizeGlyphAtlas() applies these settings when creating it.
    if (_d2dRenderTarget)
    {
        _d2dRenderTargetUpdateFontSettings(p);
    }

    _softFontBitmap.reset();
}
// Applies the DPI and the antialiasing mode of the current font settings to the Direct2D render target.
// The caller must ensure that _d2dRenderTarget exists.
void BackendD3D::_d2dRenderTargetUpdateFontSettings(const RenderingPayload& p) const noexcept
{
    const auto& fontSettings = *p.s->font;
    // The antialiasing mode is converted with a plain cast to Direct2D's text antialias mode.
    const auto textAntialiasMode = static_cast<D2D1_TEXT_ANTIALIAS_MODE>(fontSettings.antialiasingMode);

    _d2dRenderTarget->SetDpi(fontSettings.dpi, fontSettings.dpi);
    _d2dRenderTarget->SetTextAntialiasMode(textAntialiasMode);
}
// Releases all custom shader resources and recreates them according to the misc settings in `p`:
// * If a custom pixel shader path is set, the file is compiled at runtime. Compilation failures are logged and
//   reported through p.warningCallback (if set), leaving no custom shader active. If an image path is set as
//   well, the image is loaded as an additional texture; a failure to load it also discards the shader.
// * Otherwise, if the retro terminal effect is enabled, the precompiled built-in shader is used.
// If a pixel shader ends up being active, the matching vertex shader, constant buffer and sampler state are created.
// _requiresContinuousRedraw is set when the compiled shader uses its time variable (or when that cannot be determined).
// Throws if any of the D3D resource creation calls fail.
void BackendD3D::_recreateCustomShader(const RenderingPayload& p)
{
    _customRenderTargetView.reset();
    _customOffscreenTexture.reset();
    _customOffscreenTextureView.reset();
    _customVertexShader.reset();
    _customPixelShader.reset();
    _customShaderConstantBuffer.reset();
    _customShaderSamplerState.reset();
    _customShaderTexture.reset();
    _customShaderTextureView.reset();
    _requiresContinuousRedraw = false;

    if (!p.s->misc->customPixelShaderPath.empty())
    {
        // Pick the highest pixel shader profile supported by the device's feature level.
        const char* target = nullptr;
        switch (p.device->GetFeatureLevel())
        {
        case D3D_FEATURE_LEVEL_10_0:
            target = "ps_4_0";
            break;
        case D3D_FEATURE_LEVEL_10_1:
            target = "ps_4_1";
            break;
        default:
            target = "ps_5_0";
            break;
        }

        static constexpr auto flags =
            D3DCOMPILE_PACK_MATRIX_COLUMN_MAJOR
#ifdef NDEBUG
            | D3DCOMPILE_OPTIMIZATION_LEVEL3;
#else
            // Only enable strictness and warnings in DEBUG mode
            // as these settings make it very difficult to develop
            // shaders as windows terminal is not telling the user
            // what's wrong, windows terminal just fails.
            // Keep it in DEBUG mode to catch errors in shaders
            // shipped with windows terminal
            | D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_WARNINGS_ARE_ERRORS | D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
#endif

        wil::com_ptr<ID3DBlob> error;
        wil::com_ptr<ID3DBlob> blob;
        const auto hr = D3DCompileFromFile(
            /* pFileName   */ p.s->misc->customPixelShaderPath.c_str(),
            /* pDefines    */ nullptr,
            /* pInclude    */ D3D_COMPILE_STANDARD_FILE_INCLUDE,
            /* pEntrypoint */ "main",
            /* pTarget     */ target,
            /* Flags1      */ flags,
            /* Flags2      */ 0,
            /* ppCode      */ blob.addressof(),
            /* ppErrorMsgs */ error.addressof());

        if (SUCCEEDED(hr))
        {
            THROW_IF_FAILED(p.device->CreatePixelShader(blob->GetBufferPointer(), blob->GetBufferSize(), nullptr, _customPixelShader.addressof()));

            // Try to determine whether the shader uses the Time variable
            wil::com_ptr<ID3D11ShaderReflection> reflector;
            if (SUCCEEDED_LOG(D3DReflect(blob->GetBufferPointer(), blob->GetBufferSize(), IID_PPV_ARGS(reflector.addressof()))))
            {
                // Depending on the version of the d3dcompiler_*.dll, the next two functions either return nullptr
                // on failure or an instance of CInvalidSRConstantBuffer or CInvalidSRVariable respectively,
                // which cause GetDesc() to return E_FAIL. In other words, we have to assume that any failure in the
                // next few lines indicates that the cbuffer is entirely unused (--> _requiresContinuousRedraw=false).
                if (ID3D11ShaderReflectionConstantBuffer* constantBufferReflector = reflector->GetConstantBufferByIndex(0)) // shader buffer
                {
                    if (ID3D11ShaderReflectionVariable* variableReflector = constantBufferReflector->GetVariableByIndex(0)) // time
                    {
                        D3D11_SHADER_VARIABLE_DESC variableDescriptor;
                        if (SUCCEEDED(variableReflector->GetDesc(&variableDescriptor)))
                        {
                            // only if time is used
                            _requiresContinuousRedraw = WI_IsFlagSet(variableDescriptor.uFlags, D3D_SVF_USED);
                        }
                    }
                }
            }
            else
            {
                // Unless we can determine otherwise, assume this shader requires evaluation every frame
                _requiresContinuousRedraw = true;
            }
        }
        else
        {
            // The error blob (if any) contains the compiler's diagnostics as text.
            if (error)
            {
                LOG_HR_MSG(hr, "%.*hs", static_cast<int>(error->GetBufferSize()), static_cast<char*>(error->GetBufferPointer()));
            }
            else
            {
                LOG_HR(hr);
            }
            if (p.warningCallback)
            {
                p.warningCallback(D2DERR_SHADER_COMPILE_FAILED, p.s->misc->customPixelShaderPath);
            }
        }

        if (!p.s->misc->customPixelShaderImagePath.empty())
        {
            try
            {
                WIC::LoadTextureFromFile(p.device.get(), p.s->misc->customPixelShaderImagePath.c_str(), _customShaderTexture.addressof(), _customShaderTextureView.addressof());
            }
            catch (...)
            {
                LOG_CAUGHT_EXCEPTION();
                _customPixelShader.reset();
                // The shader was just discarded, so the result of the reflection above no longer applies.
                // Without this, RequiresContinuousRedraw() would keep reporting true even though no custom
                // shader is active, forcing needless redraws of every frame.
                _requiresContinuousRedraw = false;
                if (p.warningCallback)
                {
                    p.warningCallback(D2DERR_SHADER_COMPILE_FAILED, p.s->misc->customPixelShaderImagePath);
                }
            }
        }
    }
    else if (p.s->misc->useRetroTerminalEffect)
    {
        THROW_IF_FAILED(p.device->CreatePixelShader(&custom_shader_ps[0], sizeof(custom_shader_ps), nullptr, _customPixelShader.put()));
    }

    if (_customPixelShader)
    {
        THROW_IF_FAILED(p.device->CreateVertexShader(&custom_shader_vs[0], sizeof(custom_shader_vs), nullptr, _customVertexShader.put()));

        {
            // Dynamic + CPU writable, since the constants are meant to be rewritten from the CPU.
            static constexpr D3D11_BUFFER_DESC desc{
                .ByteWidth = sizeof(CustomConstBuffer),
                .Usage = D3D11_USAGE_DYNAMIC,
                .BindFlags = D3D11_BIND_CONSTANT_BUFFER,
                .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
            };
            THROW_IF_FAILED(p.device->CreateBuffer(&desc, nullptr, _customShaderConstantBuffer.put()));
        }

        {
            static constexpr D3D11_SAMPLER_DESC desc{
                .Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR,
                .AddressU = D3D11_TEXTURE_ADDRESS_BORDER,
                .AddressV = D3D11_TEXTURE_ADDRESS_BORDER,
                .AddressW = D3D11_TEXTURE_ADDRESS_BORDER,
                .MaxAnisotropy = 1,
                .ComparisonFunc = D3D11_COMPARISON_ALWAYS,
                .MaxLOD = D3D11_FLOAT32_MAX,
            };
            THROW_IF_FAILED(p.device->CreateSamplerState(&desc, _customShaderSamplerState.put()));
        }

        // Since floats are imprecise we need to constrain the time value into a range that can be accurately represented.
        // Assuming a monitor refresh rate of 1000 Hz, we can still easily represent 1000 seconds accurately (roughly 16 minutes).
        // 10000 seconds would already result in a 50% error. So to avoid this, we use queryPerfCount() modulo _customShaderPerfTickMod.
        // The use of a power of 10 is intentional, because shaders are often periodic and this makes any decimal multiplier up to 3 fractional
        // digits not break the periodicity. For instance, with a wraparound of 1000 seconds sin(1.234*x) is still perfectly periodic.
        const auto freq = queryPerfFreq();
        _customShaderPerfTickMod = freq * 1000;
        _customShaderSecsPerPerfTick = 1.0f / freq;
    }
}
void BackendD3D::_recreateCustomRenderTargetView(const RenderingPayload& p)
{
// Avoid memory usage spikes by releasing memory first.
_customOffscreenTexture.reset();
_customOffscreenTextureView.reset();
const D3D11_TEXTURE2D_DESC desc{
.Width = p.s->targetSize.x,
.Height = p.s->targetSize.y,
.MipLevels = 1,
.ArraySize = 1,
.Format = DXGI_FORMAT_B8G8R8A8_UNORM,
.SampleDesc = { 1, 0 },
.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET,
};
THROW_IF_FAILED(p.device->CreateTexture2D(&desc, nullptr, _customOffscreenTexture.addressof()));
THROW_IF_FAILED(p.device->CreateShaderResourceView(_customOffscreenTexture.get(), nullptr, _customOffscreenTextureView.addressof()));
THROW_IF_FAILED(p.device->CreateRenderTargetView(_customOffscreenTexture.get(), nullptr, _customRenderTargetView.addressof()));
}
void BackendD3D::_recreateBackgroundColorBitmap(const RenderingPayload& p)
{
// Avoid memory usage spikes by releasing memory first.
_backgroundBitmap.reset();
_backgroundBitmapView.reset();
const D3D11_TEXTURE2D_DESC desc{
.Width = p.s->viewportCellCount.x,
.Height = p.s->viewportCellCount.y,
.MipLevels = 1,
.ArraySize = 1,
.Format = DXGI_FORMAT_R8G8B8A8_UNORM,
.SampleDesc = { 1, 0 },
.Usage = D3D11_USAGE_DYNAMIC,
.BindFlags = D3D11_BIND_SHADER_RESOURCE,
.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
};
THROW_IF_FAILED(p.device->CreateTexture2D(&desc, nullptr, _backgroundBitmap.addressof()));
THROW_IF_FAILED(p.device->CreateShaderResourceView(_backgroundBitmap.get(), nullptr, _backgroundBitmapView.addressof()));
_backgroundBitmapGeneration = {};
}
void BackendD3D::_recreateConstBuffer(const RenderingPayload& p) const
{
{
VSConstBuffer data{};
data.positionScale = { 2.0f / p.s->targetSize.x, -2.0f / p.s->targetSize.y };
p.deviceContext->UpdateSubresource(_vsConstantBuffer.get(), 0, nullptr, &data, 0, 0);
}
{
PSConstBuffer data{};
data.backgroundColor = colorFromU32Premultiply<f32x4>(p.s->misc->backgroundColor);
data.backgroundCellSize = { static_cast<f32>(p.s->font->cellSize.x), static_cast<f32>(p.s->font->cellSize.y) };
data.backgroundCellCount = { static_cast<f32>(p.s->viewportCellCount.x), static_cast<f32>(p.s->viewportCellCount.y) };
DWrite_GetGammaRatios(_gamma, data.gammaRatios);
data.enhancedContrast = p.s->font->antialiasingMode == AntialiasingMode::ClearType ? _cleartypeEnhancedContrast : _grayscaleEnhancedContrast;
data.underlineWidth = p.s->font->underline.height;
data.doubleUnderlineWidth = p.s->font->doubleUnderline[0].height;
data.curlyLineHalfHeight = _curlyLineHalfHeight;
data.shadedGlyphDotSize = std::max(1.0f, std::roundf(std::max(p.s->font->cellSize.x / 16.0f, p.s->font->cellSize.y / 32.0f)));
p.deviceContext->UpdateSubresource(_psConstantBuffer.get(), 0, nullptr, &data, 0, 0);
}
}
void BackendD3D::_setupDeviceContextState(const RenderingPayload& p)
{
// IA: Input Assembler
ID3D11Buffer* vertexBuffers[]{ _vertexBuffer.get(), _instanceBuffer.get() };
static constexpr UINT strides[]{ sizeof(f32x2), sizeof(QuadInstance) };
static constexpr UINT offsets[]{ 0, 0 };
p.deviceContext->IASetIndexBuffer(_indexBuffer.get(), DXGI_FORMAT_R16_UINT, 0);
p.deviceContext->IASetInputLayout(_inputLayout.get());
p.deviceContext->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
p.deviceContext->IASetVertexBuffers(0, 2, &vertexBuffers[0], &strides[0], &offsets[0]);
// VS: Vertex Shader
p.deviceContext->VSSetShader(_vertexShader.get(), nullptr, 0);
p.deviceContext->VSSetConstantBuffers(0, 1, _vsConstantBuffer.addressof());
// RS: Rasterizer Stage
D3D11_VIEWPORT viewport{};
viewport.Width = static_cast<f32>(p.s->targetSize.x);
viewport.Height = static_cast<f32>(p.s->targetSize.y);
p.deviceContext->RSSetViewports(1, &viewport);
// PS: Pixel Shader
ID3D11ShaderResourceView* resources[]{ _backgroundBitmapView.get(), _glyphAtlasView.get() };
p.deviceContext->PSSetShader(_pixelShader.get(), nullptr, 0);
p.deviceContext->PSSetConstantBuffers(0, 1, _psConstantBuffer.addressof());
p.deviceContext->PSSetShaderResources(0, 2, &resources[0]);
// OM: Output Merger
p.deviceContext->OMSetBlendState(_blendState.get(), nullptr, 0xffffffff);
p.deviceContext->OMSetRenderTargets(1, _customRenderTargetView ? _customRenderTargetView.addressof() : _renderTargetView.addressof(), nullptr);
}
// Debug aid: recompiles the built-in shaders from their .hlsl sources once the file watcher set up in the
// constructor has scheduled a reload and the scheduled time has passed.
// Compiles to nothing unless ATLAS_DEBUG_SHADER_HOT_RELOAD is enabled. Failures are logged (CATCH_LOG) and
// leave the currently active shaders untouched.
void BackendD3D::_debugUpdateShaders(const RenderingPayload& p) noexcept
{
#if ATLAS_DEBUG_SHADER_HOT_RELOAD
try
{
// INT64_MAX means that no reload is pending. Otherwise the value is a steady_clock
// timestamp, and we wait until it has passed before reloading anything.
const auto invalidationTime = _sourceCodeInvalidationTime.load(std::memory_order_relaxed);
if (invalidationTime == INT64_MAX || invalidationTime > std::chrono::steady_clock::now().time_since_epoch().count())
{
return;
}
// Mark the reload as handled so that the file watcher can schedule the next one.
_sourceCodeInvalidationTime.store(INT64_MAX, std::memory_order_relaxed);
static constexpr auto flags =
D3DCOMPILE_PACK_MATRIX_COLUMN_MAJOR | D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_WARNINGS_ARE_ERRORS
#ifndef NDEBUG
| D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION
#endif
;
// Compiles the "main" entry point of the given file for the given target profile and returns the bytecode.
// Compiler output is shown in a message box. This happens on a detached thread so that
// the modal dialog doesn't block the caller. Throws if the compilation failed.
static const auto compile = [](const std::filesystem::path& path, const char* target) {
wil::com_ptr<ID3DBlob> error;
wil::com_ptr<ID3DBlob> blob;
const auto hr = D3DCompileFromFile(
/* pFileName */ path.c_str(),
/* pDefines */ nullptr,
/* pInclude */ D3D_COMPILE_STANDARD_FILE_INCLUDE,
/* pEntrypoint */ "main",
/* pTarget */ target,
/* Flags1 */ flags,
/* Flags2 */ 0,
/* ppCode */ blob.addressof(),
/* ppErrorMsgs */ error.addressof());
if (error)
{
// The error blob is moved into the lambda to keep it alive for as long as the message box is shown.
std::thread t{ [error = std::move(error)]() noexcept {
MessageBoxA(nullptr, static_cast<const char*>(error->GetBufferPointer()), "Compilation error", MB_ICONERROR | MB_OK);
} };
t.detach();
}
THROW_IF_FAILED(hr);
return blob;
};
// Each entry maps a source file name to the BackendD3D member (via pointer-to-member) that holds its compiled shader.
struct FileVS
{
std::wstring_view filename;
wil::com_ptr<ID3D11VertexShader> BackendD3D::*target;
};
struct FilePS
{
std::wstring_view filename;
wil::com_ptr<ID3D11PixelShader> BackendD3D::*target;
};
static constexpr std::array filesVS{
FileVS{ L"shader_vs.hlsl", &BackendD3D::_vertexShader },
};
static constexpr std::array filesPS{
FilePS{ L"shader_ps.hlsl", &BackendD3D::_pixelShader },
};
std::array<wil::com_ptr<ID3D11VertexShader>, filesVS.size()> compiledVS;
std::array<wil::com_ptr<ID3D11PixelShader>, filesPS.size()> compiledPS;
// Compile our files before moving them into `this` below to ensure we're
// always in a consistent state where all shaders are seemingly valid.
for (size_t i = 0; i < filesVS.size(); ++i)
{
const auto blob = compile(_sourceDirectory / filesVS[i].filename, "vs_4_0");
THROW_IF_FAILED(p.device->CreateVertexShader(blob->GetBufferPointer(), blob->GetBufferSize(), nullptr, compiledVS[i].addressof()));
}
for (size_t i = 0; i < filesPS.size(); ++i)
{
const auto blob = compile(_sourceDirectory / filesPS[i].filename, "ps_4_0");
THROW_IF_FAILED(p.device->CreatePixelShader(blob->GetBufferPointer(), blob->GetBufferSize(), nullptr, compiledPS[i].addressof()));
}
// Nothing below throws via THROW_IF_FAILED: all shaders compiled successfully, so they can be swapped in.
for (size_t i = 0; i < filesVS.size(); ++i)
{
this->*filesVS[i].target = std::move(compiledVS[i]);
}
for (size_t i = 0; i < filesPS.size(); ++i)
{
this->*filesPS[i].target = std::move(compiledPS[i]);
}
// Rebind the pipeline state so that the device context uses the new shader objects.
_setupDeviceContextState(p);
}
CATCH_LOG()
#endif
}
// Opens a Direct2D drawing batch on the glyph atlas render target, unless one is open already.
// Must be paired with _d2dEndDrawing().
void BackendD3D::_d2dBeginDrawing() noexcept
{
    if (_d2dBeganDrawing)
    {
        // Already inside a BeginDraw()/EndDraw() pair.
        return;
    }

    _d2dRenderTarget->BeginDraw();
    _d2dBeganDrawing = true;
}
// Closes the Direct2D drawing batch opened by _d2dBeginDrawing(), if there is one.
// Throws if EndDraw() reports a failure, in which case the batch is still considered open.
void BackendD3D::_d2dEndDrawing()
{
    if (!_d2dBeganDrawing)
    {
        // No batch is open --> nothing to do.
        return;
    }

    THROW_IF_FAILED(_d2dRenderTarget->EndDraw());
    _d2dBeganDrawing = false;
}
// Empties the glyph atlas: picks a (power-of-2) texture size for it, recreates the texture if that size differs
// from the current one, resets the rectangle packer, forgets all cached glyph entries and clears the texture.
// NOTE: This begins a Direct2D drawing batch via _d2dBeginDrawing() and leaves it open on return.
void BackendD3D::_resetGlyphAtlas(const RenderingPayload& p)
{
// The index returned by _BitScanReverse is undefined when the input is 0. We can simultaneously guard
// against that and avoid unreasonably small textures, by clamping the min. texture size to `minArea`.
// `minArea` results in a 64kB RGBA texture which is the min. alignment for placed memory.
static constexpr u32 minArea = 128 * 128;
static constexpr u32 maxArea = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION * D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
const auto cellArea = static_cast<u32>(p.s->font->cellSize.x) * p.s->font->cellSize.y;
const auto targetArea = static_cast<u32>(p.s->targetSize.x) * p.s->targetSize.y;
const auto minAreaByFont = cellArea * 95; // Covers all printable ASCII characters
// Twice the area of the current atlas, so that consecutive resets grow it (up to the maximum below).
const auto minAreaByGrowth = static_cast<u32>(_rectPacker.width) * _rectPacker.height * 2;
// It's hard to say what the max. size of the cache should be. Optimally I think we should use as much
// memory as is available, but the rendering code in this project is a big mess and so integrating
// memory pressure feedback (RegisterVideoMemoryBudgetChangeNotificationEvent) is rather difficult.
// As an alternative I'm using 1.25x the size of the swap chain. The 1.25x is there to avoid situations, where
// we're locked into a state, where on every render pass we're starting with a half full atlas, drawing once,
// filling it with the remaining half and drawing again, requiring two rendering passes on each frame.
const auto maxAreaByFont = targetArea + targetArea / 4;
auto area = std::min(maxAreaByFont, std::max(minAreaByFont, minAreaByGrowth));
area = clamp(area, minArea, maxArea);
// This block of code calculates the size of a power-of-2 texture that has an area larger than the given `area`.
// For instance, for an area of 985x1946 = 1916810 it would result in a u/v of 2048x1024 (area = 2097152).
// This has 2 benefits: GPUs like power-of-2 textures and it ensures that we don't resize the texture
// every time you resize the window by a pixel. Instead it only grows/shrinks by a factor of 2.
//
// `index` is the position of the highest set bit of `area - 1`, which makes 2^(index + 1) the smallest power
// of 2 that is >= area. That exponent is split between u and v, with u receiving the extra bit if it's odd.
unsigned long index;
_BitScanReverse(&index, area - 1);
const auto u = static_cast<u16>(1u << ((index + 2) / 2));
const auto v = static_cast<u16>(1u << ((index + 1) / 2));
if (u != _rectPacker.width || v != _rectPacker.height)
{
_resizeGlyphAtlas(p, u, v);
}
// (Re)initializing the packer marks the entire atlas area as unoccupied.
stbrp_init_target(&_rectPacker, u, v, _rectPackerData.data(), _rectPackerData.size());
// This is a little imperfect, because it only releases the memory of the glyph mappings, not the memory held by
// any DirectWrite fonts. On the other side, the amount of fonts on a system is always finite, where "finite"
// is pretty low, relatively speaking. Additionally this allows us to cache the boxGlyphs map indefinitely.
// It's not great, but it's not terrible.
for (auto& slot : _glyphAtlasMap.container())
{
for (auto& glyphs : slot.glyphs)
{
glyphs.clear();
}
}
for (auto& glyphs : _builtinGlyphs.glyphs)
{
glyphs.clear();
}
// Clear the atlas texture itself. Clear() must be called inside a BeginDraw()/EndDraw() pair.
_d2dBeginDrawing();
_d2dRenderTarget->Clear();
// The deferred reset requested by _updateFontDependents() has now been performed.
_fontChangedResetGlyphAtlas = false;
}
// Recreates the glyph atlas texture with a size of u x v pixels, together with everything that depends on it:
// its shader resource view, the Direct2D render target drawing into it, the brushes and the rectangle packer storage.
// Throws if any of the resource creation calls fail.
void BackendD3D::_resizeGlyphAtlas(const RenderingPayload& p, const u16 u, const u16 v)
{
#if defined(_M_X64) || defined(_M_IX86)
// Function-local static: the check runs only once, during the first call.
static const auto faultyMacTypeVersion = _checkMacTypeVersion(p);
#else
// The affected versions of MacType are unavailable on ARM.
static constexpr auto faultyMacTypeVersion = false;
#endif
// Release the old resources before creating the new ones.
_d2dRenderTarget.reset();
_d2dRenderTarget4.reset();
_glyphAtlas.reset();
_glyphAtlasView.reset();
{
// The atlas is drawn into by Direct2D (render target) and sampled from by our pixel shader (shader resource).
const D3D11_TEXTURE2D_DESC desc{
.Width = u,
.Height = v,
.MipLevels = 1,
.ArraySize = 1,
.Format = DXGI_FORMAT_B8G8R8A8_UNORM,
.SampleDesc = { 1, 0 },
.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET,
};
THROW_IF_FAILED(p.device->CreateTexture2D(&desc, nullptr, _glyphAtlas.addressof()));
THROW_IF_FAILED(p.device->CreateShaderResourceView(_glyphAtlas.get(), nullptr, _glyphAtlasView.addressof()));
}
{
const auto surface = _glyphAtlas.query<IDXGISurface>();
static constexpr D2D1_RENDER_TARGET_PROPERTIES props{
.type = D2D1_RENDER_TARGET_TYPE_DEFAULT,
.pixelFormat = { DXGI_FORMAT_B8G8R8A8_UNORM, D2D1_ALPHA_MODE_PREMULTIPLIED },
};
// ID2D1RenderTarget and ID2D1DeviceContext are the same and I'm tired of pretending they're not.
THROW_IF_FAILED(p.d2dFactory->CreateDxgiSurfaceRenderTarget(surface.get(), &props, reinterpret_cast<ID2D1RenderTarget**>(_d2dRenderTarget.addressof())));
// This query is allowed to fail, in which case _d2dRenderTarget4 remains null (see the check below).
_d2dRenderTarget.try_query_to(_d2dRenderTarget4.addressof());
_d2dRenderTarget->SetUnitMode(D2D1_UNIT_MODE_PIXELS);
// Ensure that D2D uses the exact same gamma as our shader uses.
_d2dRenderTarget->SetTextRenderingParams(_textRenderingParams.get());
_d2dRenderTargetUpdateFontSettings(p);
}
// We have our own glyph cache so Direct2D's cache doesn't help much.
// This saves us 1MB of RAM, which is not much, but also not nothing.
if (_d2dRenderTarget4)
{
wil::com_ptr<ID2D1Device> device;
_d2dRenderTarget4->GetDevice(device.addressof());
device->SetMaximumTextureMemory(0);
// ID2D1Device4 is the interface that faulty MacType versions hook incorrectly (see _checkMacTypeVersion).
if (!faultyMacTypeVersion)
{
if (const auto device4 = device.try_query<ID2D1Device4>())
{
device4->SetMaximumColorGlyphCacheMemory(0);
}
}
}
{
// Brushes belong to the render target that created them, so they have to be recreated along with it.
THROW_IF_FAILED(_d2dRenderTarget->CreateSolidColorBrush(&whiteColor, nullptr, _emojiBrush.put()));
THROW_IF_FAILED(_d2dRenderTarget->CreateSolidColorBrush(&whiteColor, nullptr, _brush.put()));
}
// Bind the new atlas view to the pixel shader (slot 1), same as in _setupDeviceContextState().
ID3D11ShaderResourceView* resources[]{ _backgroundBitmapView.get(), _glyphAtlasView.get() };
p.deviceContext->PSSetShaderResources(0, 2, &resources[0]);
// Storage for the rectangle packer: one node per pixel column (u) of the atlas.
_rectPackerData = Buffer<stbrp_node>{ u };
}
// MacType is a popular 3rd party system to give the font rendering on Windows a softer look.
// It's particularly popular in China. Unfortunately, it hooks ID2D1Device4 incorrectly:
//   https://github.com/snowie2000/mactype/pull/938
// This results in crashes. Not a lot of them, but enough to constantly show up.
// The issue was fixed in the MacType v1.2023.5.31 release, the only one in 2023.
//
// Returns true if a MacType core DLL with a file version below 1.2023 is loaded into this process.
// In that case ATLAS_ENGINE_ERROR_MAC_TYPE is also reported through p.warningCallback (if one is set).
// Returns false if the DLL isn't loaded or if its version resource cannot be read.
//
// Please feel free to remove this check in a few years.
bool BackendD3D::_checkMacTypeVersion(const RenderingPayload& p)
{
#ifdef _WIN64
    static constexpr auto name = L"MacType64.Core.dll";
#else
    static constexpr auto name = L"MacType.Core.dll";
#endif

    // With flags = 0 this takes a reference on the module, which unique_hmodule releases again.
    wil::unique_hmodule handle;
    if (!GetModuleHandleExW(0, name, handle.addressof()))
    {
        return false;
    }

    const auto resource = FindResourceW(handle.get(), MAKEINTRESOURCE(VS_VERSION_INFO), RT_VERSION);
    if (!resource)
    {
        return false;
    }

    const auto dataHandle = LoadResource(handle.get(), resource);
    if (!dataHandle)
    {
        return false;
    }

    const auto data = LockResource(dataHandle);
    if (!data)
    {
        return false;
    }

    // Don't trust the resource blindly: Only dereference `info` if VerQueryValueW
    // actually handed us a pointer to a block that is large enough to be a VS_FIXEDFILEINFO.
    VS_FIXEDFILEINFO* info = nullptr;
    UINT varLen = 0;
    if (!VerQueryValueW(data, L"\\", reinterpret_cast<void**>(&info), &varLen) || !info || varLen < sizeof(VS_FIXEDFILEINFO))
    {
        return false;
    }

    // dwFileVersionMS packs the major version into the high word and the minor version
    // into the low word, so this compares the file version against "1.2023".
    const auto faulty = info->dwFileVersionMS < (1 << 16 | 2023);

    if (faulty && p.warningCallback)
    {
        p.warningCallback(ATLAS_ENGINE_ERROR_MAC_TYPE, {});
    }

    return faulty;
}
// Returns a reference to the quad that was appended most recently.
// Must only be called while at least one quad is queued up.
BackendD3D::QuadInstance& BackendD3D::_getLastQuad() noexcept
{
    assert(_instancesCount != 0);

    const auto lastIndex = _instancesCount - 1;
    return _instances[lastIndex];
}
// Hands out the next free slot in the quad queue, growing the backing buffer first if it is full.
//
// NOTE: This gets called up to 5M times per second -> no std::vector, no std::unordered_map.
//       This function is an easy >100x faster than std::vector, can be
//       inlined and reduces overall (!) renderer CPU usage by 5%.
BackendD3D::QuadInstance& BackendD3D::_appendQuad()
{
    const auto capacity = _instances.size();
    if (_instancesCount >= capacity)
    {
        _bumpInstancesSize();
    }

    const auto index = _instancesCount++;
    return _instances[index];
}
void BackendD3D::_bumpInstancesSize()
{
auto newSize = std::max(_instancesCount, _instances.size() * 2);
newSize = std::max(size_t{ 256 }, newSize);
Expects(newSize > _instances.size());
// Our render loop heavily relies on memcpy() which is up to between 1.5x (Intel)
// and 40x (AMD) faster for allocations with an alignment of 32 or greater.
auto newInstances = Buffer<QuadInstance, 32>{ newSize };
std::copy_n(_instances.data(), _instances.size(), newInstances.data());
_instances = std::move(newInstances);
}
void BackendD3D::_flushQuads(const RenderingPayload& p)
{
if (!_instancesCount)
{
return;
}
if (!_cursorRects.empty())
{
_drawCursorForeground();
}
// TODO: Shrink instances buffer
if (_instancesCount > _instanceBufferCapacity)
{
_recreateInstanceBuffers(p);
}
{
D3D11_MAPPED_SUBRESOURCE mapped{};
THROW_IF_FAILED(p.deviceContext->Map(_instanceBuffer.get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped));
memcpy(mapped.pData, _instances.data(), _instancesCount * sizeof(QuadInstance));
p.deviceContext->Unmap(_instanceBuffer.get(), 0);
}
// I found 4 approaches to drawing lots of quads quickly. There are probably even more.
// They can often be found in discussions about "particle" or "point sprite" rendering in game development.
// * Compute Shader: My understanding is that at the time of writing games are moving over to bucketing
// particles into "tiles" on the screen and drawing them with a compute shader. While this improves
// performance, it doesn't mix well with our goal of allowing arbitrary overlaps between glyphs.
// Additionally none of the next 3 approaches use any significant amount of GPU time in the first place.
// * Geometry Shader: Geometry shaders can generate vertices on the fly, which would neatly replace our need
// for an index buffer. However, many sources claim they're significantly slower than the following approaches.
// * DrawIndexed & DrawInstanced: Again, many sources claim that GPU instancing (Draw(Indexed)Instanced) performs
// poorly for small meshes, and instead indexed vertices with a SRV (shader resource view) should be used.
// The popular "Vertex Shader Tricks" talk from Bill Bilodeau at GDC 2014 suggests this approach, explains
// how it works (you divide the `SV_VertexID` by 4 and index into the SRV that contains the per-instance data;
// it's basically manual instancing inside the vertex shader) and shows how it outperforms regular instancing.
// However on my own limited test hardware (built around ~2020), I found that for at least our use case,
// GPU instancing matches the performance of using a custom buffer. In fact on my Nvidia GPU in particular,
// instancing with ~10k instances appears to be about 50% faster and so DrawInstanced was chosen.
// Instead I found that packing instance data as tightly as possible made the biggest performance difference,
// and packing 16 bit integers with ID3D11InputLayout is quite a bit more convenient too.
p.deviceContext->DrawIndexedInstanced(6, static_cast<UINT>(_instancesCount), 0, 0, 0);
_instancesCount = 0;
}
void BackendD3D::_recreateInstanceBuffers(const RenderingPayload& p)
{
// We use the viewport size of the terminal as the initial estimate for the amount of instances we'll see.
const auto minCapacity = static_cast<size_t>(p.s->viewportCellCount.x) * p.s->viewportCellCount.y;
auto newCapacity = std::max(_instancesCount, minCapacity);
auto newSize = newCapacity * sizeof(QuadInstance);
// Round up to multiples of 64kB to avoid reallocating too often.
// 64kB is the minimum alignment for committed resources in D3D12.
newSize = alignForward<size_t>(newSize, 64 * 1024);
newCapacity = newSize / sizeof(QuadInstance);
_instanceBuffer.reset();
{
const D3D11_BUFFER_DESC desc{
.ByteWidth = gsl::narrow<UINT>(newSize),
.Usage = D3D11_USAGE_DYNAMIC,
.BindFlags = D3D11_BIND_VERTEX_BUFFER,
.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE,
.StructureByteStride = sizeof(QuadInstance),
};
THROW_IF_FAILED(p.device->CreateBuffer(&desc, nullptr, _instanceBuffer.addressof()));
}
// IA: Input Assembler
ID3D11Buffer* vertexBuffers[]{ _vertexBuffer.get(), _instanceBuffer.get() };
static constexpr UINT strides[]{ sizeof(f32x2), sizeof(QuadInstance) };
static constexpr UINT offsets[]{ 0, 0 };
p.deviceContext->IASetVertexBuffers(0, 2, &vertexBuffers[0], &strides[0], &offsets[0]);
_instanceBufferCapacity = newCapacity;
}
// Queues a single quad of p.s->targetSize with the Background shading type and,
// if the background colors changed since the last upload, refreshes the bitmap.
void BackendD3D::_drawBackground(const RenderingPayload& p)
{
    // Not uploading the bitmap halves (!) the GPU load for any given frame on 2023 hardware.
    const bool bitmapIsStale = _backgroundBitmapGeneration != p.colorBitmapGenerations[0];
    if (bitmapIsStale)
    {
        _uploadBackgroundBitmap(p);
    }

    auto& quad = _appendQuad();
    quad = {
        .shadingType = static_cast<u16>(ShadingType::Background),
        .size = p.s->targetSize,
    };
}
void BackendD3D::_uploadBackgroundBitmap(const RenderingPayload& p)
{
D3D11_MAPPED_SUBRESOURCE mapped{};
THROW_IF_FAILED(p.deviceContext->Map(_backgroundBitmap.get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped));
auto src = std::bit_cast<const char*>(p.backgroundBitmap.data());
const auto srcEnd = std::bit_cast<const char*>(p.backgroundBitmap.data() + p.backgroundBitmap.size());
const auto srcStride = p.colorBitmapRowStride * sizeof(u32);
auto dst = static_cast<char*>(mapped.pData);
while (src < srcEnd)
{
memcpy(dst, src, srcStride);
src += srcStride;
dst += mapped.RowPitch;
}
p.deviceContext->Unmap(_backgroundBitmap.get(), 0);
_backgroundBitmapGeneration = p.colorBitmapGenerations[0];
}
// Queues one quad per visible (non-whitespace) glyph of every row in p.rows, rasterizing glyphs
// into the glyph atlas on demand, and appends the gridlines of each row. While doing so it tracks
// the vertical extent of the drawn glyphs per row and grows p.dirtyRectInPx to cover the
// extents of all rows listed in p.invalidatedRows.
void BackendD3D::_drawText(RenderingPayload& p)
{
if (_fontChangedResetGlyphAtlas)
{
_resetGlyphAtlas(p);
}
// Start with an inverted (= empty) range so that the min/max below work without special cases.
til::CoordType dirtyTop = til::CoordTypeMax;
til::CoordType dirtyBottom = til::CoordTypeMin;
u16 y = 0;
for (const auto row : p.rows)
{
f32 baselineX = 0;
f32 baselineY = y * p.s->font->cellSize.y + p.s->font->baseline;
f32 scaleX = 1;
f32 scaleY = 1;
// Any rendition other than SingleWidth is twice as wide, and the
// DoubleHeightTop/Bottom renditions are additionally twice as tall.
if (row->lineRendition != LineRendition::SingleWidth)
{
scaleX = 2;
if (row->lineRendition >= LineRendition::DoubleHeightTop)
{
scaleY = 2;
// The glyph position is multiplied by scaleY further below, so this is pre-divided here.
baselineY /= 2;
}
}
const u8x2 renditionScale{
static_cast<u8>(row->lineRendition != LineRendition::SingleWidth ? 2 : 1),
static_cast<u8>(row->lineRendition >= LineRendition::DoubleHeightTop ? 2 : 1),
};
// Each mapping is a run of glyphs [glyphsFrom, glyphsTo) that share the same font face.
for (const auto& m : row->mappings)
{
auto x = m.glyphsFrom;
const auto glyphsTo = m.glyphsTo;
const auto fontFace = m.fontFace.get();
// The lack of a fontFace indicates a soft font.
AtlasFontFaceEntry* fontFaceEntry = &_builtinGlyphs;
if (fontFace) [[likely]]
{
fontFaceEntry = _glyphAtlasMap.insert(fontFace).first;
}
// Glyphs are cached separately per line rendition.
const auto& glyphs = fontFaceEntry->glyphs[WI_EnumValue(row->lineRendition)];
while (x < glyphsTo)
{
size_t dx = 1;
u32 glyphIndex = row->glyphIndices[x];
// Note: !fontFace is only nullptr for builtin glyphs which then use glyphIndices for UTF16 code points.
// In other words, this doesn't accidentally corrupt any actual glyph indices.
if (!fontFace && til::is_leading_surrogate(glyphIndex))
{
glyphIndex = til::combine_surrogates(glyphIndex, row->glyphIndices[x + 1]);
dx = 2;
}
// Cache miss -> rasterize the glyph into the atlas now.
auto glyphEntry = glyphs.lookup(glyphIndex);
if (!glyphEntry)
{
glyphEntry = _drawGlyph(p, *row, *fontFaceEntry, glyphIndex);
}
// A shadingType of 0 (ShadingType::Default) indicates a glyph that is whitespace.
if (glyphEntry->shadingType != ShadingType::Default)
{
// l/t: The top-left corner of the glyph's quad in pixels.
auto l = static_cast<til::CoordType>(lrintf((baselineX + row->glyphOffsets[x].advanceOffset) * scaleX));
auto t = static_cast<til::CoordType>(lrintf((baselineY - row->glyphOffsets[x].ascenderOffset) * scaleY));
l += glyphEntry->offset.x;
t += glyphEntry->offset.y;
row->dirtyTop = std::min(row->dirtyTop, t);
row->dirtyBottom = std::max(row->dirtyBottom, t + glyphEntry->size.y);
_appendQuad() = {
.shadingType = static_cast<u16>(glyphEntry->shadingType),
.renditionScale = renditionScale,
.position = { static_cast<i16>(l), static_cast<i16>(t) },
.size = glyphEntry->size,
.texcoord = glyphEntry->texcoord,
.color = row->colors[x],
};
// Ligature-like glyphs get split up into one quad per foreground color.
// This operates on the quad that was just appended above.
if (glyphEntry->overlapSplit)
{
_drawTextOverlapSplit(p, y);
}
}
baselineX += row->glyphAdvances[x];
x += dx;
}
}
if (!row->gridLineRanges.empty())
{
_drawGridlines(p, y);
}
// Only rows that were invalidated contribute to the dirty rect.
if (p.invalidatedRows.contains(y))
{
dirtyTop = std::min(dirtyTop, row->dirtyTop);
dirtyBottom = std::max(dirtyBottom, row->dirtyBottom);
}
++y;
}
// The range is still inverted if no invalidated row drew anything.
if (dirtyTop < dirtyBottom)
{
p.dirtyRectInPx.top = std::min(p.dirtyRectInPx.top, dirtyTop);
p.dirtyRectInPx.bottom = std::max(p.dirtyRectInPx.bottom, dirtyBottom);
}
// _drawGlyph may have begun a Direct2D drawing session (via _d2dBeginDrawing).
_d2dEndDrawing();
}
// There are a number of coding-oriented fonts that feature ligatures which (for instance)
// translate text like "!=" into a glyph that looks like "≠" (just 2 columns wide and not 1).
// Glyphs like that still need to be colored in potentially multiple colors however, so this
// function will handle these ligatures by splitting them up into multiple QuadInstances.
//
// It works by iteratively splitting the wide glyph into shorter and shorter segments like so
// (whitespaces indicate that the glyph was split up in a leading and trailing half):
//   <!--
//   < !--
//   < ! --
//   < ! - -
//
// The quad to split is the one that was appended last (see _getLastQuad).
// `y` is the index of the row (in p.rows) that the quad belongs to.
void BackendD3D::_drawTextOverlapSplit(const RenderingPayload& p, u16 y)
{
auto& originalQuad = _getLastQuad();
// If the current row has a non-default line rendition then every glyph is scaled up by 2x horizontally.
// This requires us to make some changes: For instance, if the ligature occupies columns 3, 4 and 5 (0-indexed)
// then we need to get the foreground colors from columns 2 and 4, because columns 0,1 2,3 4,5 6,7 and so on form pairs.
// A wide glyph would be a total of 4 actual columns wide! In other words, we need to properly round our clip rects and columns.
i32 columnAdvance = 1;
i32 columnAdvanceInPx = p.s->font->cellSize.x;
i32 cellCount = p.s->viewportCellCount.x;
if (p.rows[y]->lineRendition != LineRendition::SingleWidth)
{
columnAdvance = 2;
columnAdvanceInPx <<= 1;
cellCount >>= 1;
}
// Clamp the horizontal extent of the quad to the viewport.
i32 originalLeft = originalQuad.position.x;
i32 originalRight = originalQuad.position.x + originalQuad.size.x;
originalLeft = std::max(originalLeft, 0);
originalRight = std::min(originalRight, cellCount * columnAdvanceInPx);
// Nothing to split if the quad lies entirely outside of the viewport.
if (originalLeft >= originalRight)
{
return;
}
// The foreground colors of row `y`, one per column.
const auto colors = &p.foregroundBitmap[p.colorBitmapRowStride * y];
// As explained in the beginning, column and clipLeft should be in multiples of columnAdvance
// and columnAdvanceInPx respectively, because that's how line renditions work.
auto column = originalLeft / columnAdvanceInPx;
auto clipLeft = column * columnAdvanceInPx;
column *= columnAdvance;
// Our loop below will split the ligature by processing it from left to right.
// Some fonts however implement ligatures by replacing a string like "&&" with a whitespace padding glyph,
// followed by the actual "&&" glyph which has a 1 column advance width. In that case the originalQuad
// will have the .color of the 2nd column and not of the 1st one. We need to fix that here.
auto lastFg = colors[column];
originalQuad.color = lastFg;
column += columnAdvance;
clipLeft += columnAdvanceInPx;
// We must ensure to exit the loop while `column` is less than `cellCount`,
// otherwise we cause a potential out of bounds access into foregroundBitmap.
// This may happen with glyphs that are severely overlapping their cells,
// outside of the viewport. The `clipLeft < originalRight` condition doubles
// as a `column < cellCount` condition thanks to us std::min()ing it above.
for (; clipLeft < originalRight; column += columnAdvance, clipLeft += columnAdvanceInPx)
{
const auto fg = colors[column];
// A split is only needed wherever the foreground color changes between two columns.
if (lastFg != fg)
{
// NOTE: _appendQuad might reallocate and any pointers
// acquired before calling this function are now invalid.
// (This includes `originalQuad`, which must not be used from here on.)
auto& next = _appendQuad();
// The item at -1 is the quad we've just appended, which means
// that the previous quad we want to split up is at -2.
auto& prev = _instances[_instancesCount - 2];
// `prev` keeps everything left of clipLeft and `next` gets the remainder.
const auto prevWidth = clipLeft - prev.position.x;
const auto nextWidth = prev.size.x - prevWidth;
prev.size.x = gsl::narrow<u16>(prevWidth);
// `next` starts out as a copy of `prev` and is then shifted to the
// right by prevWidth, both on screen and within the glyph atlas.
next = prev;
next.position.x = gsl::narrow<i16>(next.position.x + prevWidth);
next.texcoord.x = gsl::narrow<u16>(next.texcoord.x + prevWidth);
next.size.x = gsl::narrow<u16>(nextWidth);
next.color = fg;
lastFg = fg;
}
}
}
// Rasterizes the glyph `glyphIndex` of the given font face into the glyph atlas using Direct2D
// and returns the glyph cache entry that was created for it (for the line rendition of `row`).
// Builtin glyphs (font face entries without a fontFace) are forwarded to _drawBuiltinGlyph.
// Glyphs with empty bounds (= whitespace) get an entry without any atlas space assigned to them.
BackendD3D::AtlasGlyphEntry* BackendD3D::_drawGlyph(const RenderingPayload& p, const ShapedRow& row, AtlasFontFaceEntry& fontFaceEntry, u32 glyphIndex)
{
// The lack of a fontFace indicates a soft font.
if (!fontFaceEntry.fontFace)
{
return _drawBuiltinGlyph(p, row, fontFaceEntry, glyphIndex);
}
// A glyph run consisting of just this one glyph.
const auto glyphIndexU16 = static_cast<u16>(glyphIndex);
const DWRITE_GLYPH_RUN glyphRun{
.fontFace = fontFaceEntry.fontFace.get(),
.fontEmSize = p.s->font->fontSize,
.glyphCount = 1,
.glyphIndices = &glyphIndexU16,
};
// It took me a while to figure out how to rasterize glyphs manually with DirectWrite without depending on Direct2D.
// The benefits are a reduction in memory usage, an increase in performance under certain circumstances and most
// importantly, the ability to debug the renderer more easily, because many graphics debuggers don't support Direct2D.
// Direct2D has one big advantage however: It features a clever glyph uploader with a pool of D3D11_USAGE_STAGING textures,
// which I was too short on time to implement myself. This makes rasterization with Direct2D roughly 2x faster.
//
// This code remains, because it features some parts that are slightly more and some parts that are outright difficult to figure out.
#if 0
const auto wantsClearType = p.s->font->antialiasingMode == AntialiasingMode::ClearType;
const auto wantsAliased = p.s->font->antialiasingMode == AntialiasingMode::Aliased;
const auto antialiasMode = wantsClearType ? DWRITE_TEXT_ANTIALIAS_MODE_CLEARTYPE : DWRITE_TEXT_ANTIALIAS_MODE_GRAYSCALE;
const auto outlineThreshold = wantsAliased ? DWRITE_OUTLINE_THRESHOLD_ALIASED : DWRITE_OUTLINE_THRESHOLD_ANTIALIASED;
DWRITE_RENDERING_MODE renderingMode{};
DWRITE_GRID_FIT_MODE gridFitMode{};
THROW_IF_FAILED(fontFaceEntry.fontFace->GetRecommendedRenderingMode(
/* fontEmSize */ glyphRun.fontEmSize,
/* dpiX */ 1, // fontEmSize is already in pixel
/* dpiY */ 1, // fontEmSize is already in pixel
/* transform */ nullptr,
/* isSideways */ FALSE,
/* outlineThreshold */ outlineThreshold,
/* measuringMode */ DWRITE_MEASURING_MODE_NATURAL,
/* renderingParams */ _textRenderingParams.get(),
/* renderingMode */ &renderingMode,
/* gridFitMode */ &gridFitMode));
wil::com_ptr<IDWriteGlyphRunAnalysis> glyphRunAnalysis;
THROW_IF_FAILED(p.dwriteFactory->CreateGlyphRunAnalysis(
/* glyphRun */ &glyphRun,
/* transform */ nullptr,
/* renderingMode */ renderingMode,
/* measuringMode */ DWRITE_MEASURING_MODE_NATURAL,
/* gridFitMode */ gridFitMode,
/* antialiasMode */ antialiasMode,
/* baselineOriginX */ 0,
/* baselineOriginY */ 0,
/* glyphRunAnalysis */ glyphRunAnalysis.put()));
RECT textureBounds{};
if (wantsClearType)
{
THROW_IF_FAILED(glyphRunAnalysis->GetAlphaTextureBounds(DWRITE_TEXTURE_CLEARTYPE_3x1, &textureBounds));
// Some glyphs cannot be drawn with ClearType, such as bitmap fonts. In that case
// GetAlphaTextureBounds() supposedly returns an empty RECT, but I haven't tested that yet.
if (!IsRectEmpty(&textureBounds))
{
// Allocate a buffer of `3 * width * height` bytes.
THROW_IF_FAILED(glyphRunAnalysis->CreateAlphaTexture(DWRITE_TEXTURE_CLEARTYPE_3x1, &textureBounds, buffer.data(), buffer.size()));
// The buffer contains RGB ClearType weights which can now be packed into RGBA and uploaded to the glyph atlas.
return;
}
// --> Retry with grayscale AA.
}
// Even though it says "ALIASED" and even though the docs for the flag still say:
// > [...] that is, each pixel is either fully opaque or fully transparent [...]
// don't be confused: It's grayscale antialiased. lol
THROW_IF_FAILED(glyphRunAnalysis->GetAlphaTextureBounds(DWRITE_TEXTURE_ALIASED_1x1, &textureBounds));
// Allocate a buffer of `width * height` bytes.
THROW_IF_FAILED(glyphRunAnalysis->CreateAlphaTexture(DWRITE_TEXTURE_ALIASED_1x1, &textureBounds, buffer.data(), buffer.size()));
// The buffer now contains a grayscale alpha mask.
#endif
// This code finds the local font file path. Useful for debugging as it
// gets you the font.ttf <> glyphIndex pair to uniquely identify glyphs.
#if 0
std::vector<std::wstring> paths;
UINT32 numberOfFiles;
THROW_IF_FAILED(fontFaceEntry.fontFace->GetFiles(&numberOfFiles, nullptr));
wil::com_ptr<IDWriteFontFile> fontFiles[8];
THROW_IF_FAILED(fontFaceEntry.fontFace->GetFiles(&numberOfFiles, fontFiles[0].addressof()));
for (UINT32 i = 0; i < numberOfFiles; ++i)
{
wil::com_ptr<IDWriteFontFileLoader> loader;
THROW_IF_FAILED(fontFiles[i]->GetLoader(loader.addressof()));
void const* fontFileReferenceKey;
UINT32 fontFileReferenceKeySize;
THROW_IF_FAILED(fontFiles[i]->GetReferenceKey(&fontFileReferenceKey, &fontFileReferenceKeySize));
if (const auto localLoader = loader.try_query<IDWriteLocalFontFileLoader>())
{
UINT32 filePathLength;
THROW_IF_FAILED(localLoader->GetFilePathLengthFromKey(fontFileReferenceKey, fontFileReferenceKeySize, &filePathLength));
filePathLength++;
std::wstring filePath(filePathLength, L'\0');
THROW_IF_FAILED(localLoader->GetFilePathFromKey(fontFileReferenceKey, fontFileReferenceKeySize, filePath.data(), filePathLength));
paths.emplace_back(std::move(filePath));
}
}
#endif
// `scale` is 1 for any double-width/-height line rendition and 0 otherwise.
// It's used both as a boolean and as a shift amount further below.
const int scale = row.lineRendition != LineRendition::SingleWidth;
D2D1_MATRIX_3X2_F transform = identityTransform;
if (scale)
{
// The transform is set before measuring the glyph so that the bounds are those of the scaled up glyph.
transform.m11 = 2.0f;
transform.m22 = row.lineRendition >= LineRendition::DoubleHeightTop ? 2.0f : 1.0f;
_d2dRenderTarget->SetTransform(&transform);
}
// Whichever way this function is left, the render target ends up with the identity transform again.
const auto restoreTransform = wil::scope_exit([&]() noexcept {
_d2dRenderTarget->SetTransform(&identityTransform);
});
// This calculates the black box of the glyph, or in other words,
// its extents/size relative to its baseline origin (at 0,0).
//
//   bounds.top ------++-----######--+
//   (-7)             ||  ############
//                    ||####      ####
//                    |###       #####
//   baseline ______  |###      #####|
//   origin        \  |############# |
//   (= 0,0)        \ |###########   |
//                    ++-------###---+
//                    ##      ###    |
//   bounds.bottom ---+#########-----+
//   (+2)             |              |
//             bounds.left     bounds.right
//             (-1)            (+14)
//
bool isColorGlyph = false;
D2D1_RECT_F bounds = GlyphRunEmptyBounds;
// Color glyphs are drawn with grayscale antialiasing (see below).
// This restores the configured antialiasing mode afterwards.
const auto antialiasingCleanup = wil::scope_exit([&]() {
if (isColorGlyph)
{
_d2dRenderTarget4->SetTextAntialiasMode(static_cast<D2D1_TEXT_ANTIALIAS_MODE>(p.s->font->antialiasingMode));
}
});
{
wil::com_ptr<IDWriteColorGlyphRunEnumerator1> enumerator;
if (p.s->font->colorGlyphs)
{
enumerator = TranslateColorGlyphRun(p.dwriteFactory4.get(), {}, &glyphRun);
}
// No enumerator -> Not a color glyph (or color glyphs are disabled) -> Measure it as a regular glyph run.
if (!enumerator)
{
THROW_IF_FAILED(_d2dRenderTarget->GetGlyphRunWorldBounds({}, &glyphRun, DWRITE_MEASURING_MODE_NATURAL, &bounds));
}
else
{
isColorGlyph = true;
_d2dRenderTarget4->SetTextAntialiasMode(D2D1_TEXT_ANTIALIAS_MODE_GRAYSCALE);
// A color glyph consists of multiple runs. The bounds are accumulated over all of them.
while (ColorGlyphRunMoveNext(enumerator.get()))
{
const auto colorGlyphRun = ColorGlyphRunGetCurrentRun(enumerator.get());
ColorGlyphRunAccumulateBounds(_d2dRenderTarget.get(), colorGlyphRun, bounds);
}
}
}
// The bounds may be empty if the glyph is whitespace.
if (bounds.left >= bounds.right || bounds.top >= bounds.bottom)
{
return _drawGlyphAllocateEntry(row, fontFaceEntry, glyphIndex);
}
// Round the bounds to whole pixels.
const auto bl = lrintf(bounds.left);
const auto bt = lrintf(bounds.top);
const auto br = lrintf(bounds.right);
const auto bb = lrintf(bounds.bottom);
// Reserve space for the glyph in the atlas. This fills in rect.x and rect.y.
stbrp_rect rect{
.w = br - bl,
.h = bb - bt,
};
_drawGlyphAtlasAllocate(p, rect);
_d2dBeginDrawing();
// Position the baseline origin such that the top-left corner
// of the glyph's bounds ends up at rect.x/y inside the atlas.
const D2D1_POINT_2F baselineOrigin{
static_cast<f32>(rect.x - bl),
static_cast<f32>(rect.y - bt),
};
if (scale)
{
// With this translation the scaling transform maps the baseline origin onto itself.
transform.dx = (1.0f - transform.m11) * baselineOrigin.x;
transform.dy = (1.0f - transform.m22) * baselineOrigin.y;
_d2dRenderTarget->SetTransform(&transform);
}
if (!isColorGlyph)
{
_d2dRenderTarget->DrawGlyphRun(baselineOrigin, &glyphRun, _brush.get(), DWRITE_MEASURING_MODE_NATURAL);
}
else
{
// The enumerator from above was created for the origin 0,0, so a new one is created for the actual position.
const auto enumerator = TranslateColorGlyphRun(p.dwriteFactory4.get(), baselineOrigin, &glyphRun);
while (ColorGlyphRunMoveNext(enumerator.get()))
{
const auto colorGlyphRun = ColorGlyphRunGetCurrentRun(enumerator.get());
ColorGlyphRunDraw(_d2dRenderTarget4.get(), _emojiBrush.get(), _brush.get(), colorGlyphRun);
}
}
// Ligatures are drawn with strict cell-wise foreground color, while other text allows colors to overhang
// their cells. This makes sure that italics and such retain their color and don't look "cut off".
//
// The former condition makes sure to exclude diacritics and such from being considered a ligature,
// while the latter condition-pair makes sure to exclude regular BMP wide glyphs that overlap a little.
const auto triggerLeft = _ligatureOverhangTriggerLeft << scale;
const auto triggerRight = _ligatureOverhangTriggerRight << scale;
const auto overlapSplit = rect.w >= p.s->font->cellSize.x && (bl <= triggerLeft || br >= triggerRight);
// Finally, create the cache entry and describe where the glyph lives inside the atlas
// (texcoord + size) and how it's positioned relative to the baseline origin (offset).
const auto glyphEntry = _drawGlyphAllocateEntry(row, fontFaceEntry, glyphIndex);
glyphEntry->shadingType = isColorGlyph ? ShadingType::TextPassthrough : _textShadingType;
glyphEntry->overlapSplit = overlapSplit;
glyphEntry->offset.x = bl;
glyphEntry->offset.y = bt;
glyphEntry->size.x = rect.w;
glyphEntry->size.y = rect.h;
glyphEntry->texcoord.x = rect.x;
glyphEntry->texcoord.y = rect.y;
if (row.lineRendition >= LineRendition::DoubleHeightTop)
{
_splitDoubleHeightGlyph(p, row, fontFaceEntry, glyphEntry);
}
return glyphEntry;
}
// Draws a glyph that doesn't come from a font face (a soft font character or one of the builtin glyphs)
// into the glyph atlas and returns the cache entry created for it. Such glyphs always cover exactly one
// cell, scaled up according to the line rendition of `row`.
BackendD3D::AtlasGlyphEntry* BackendD3D::_drawBuiltinGlyph(const RenderingPayload& p, const ShapedRow& row, AtlasFontFaceEntry& fontFaceEntry, u32 glyphIndex)
{
    const auto isDoubleWidth = row.lineRendition != LineRendition::SingleWidth;
    const auto isDoubleHeight = row.lineRendition >= LineRendition::DoubleHeightTop;

    auto baseline = p.s->font->baseline;
    stbrp_rect rect{
        .w = p.s->font->cellSize.x,
        .h = p.s->font->cellSize.y,
    };

    if (isDoubleWidth)
    {
        // Double-width rows are always twice as wide, but only some of them are also twice as tall.
        const auto heightShift = static_cast<u8>(isDoubleHeight);
        rect.w <<= 1;
        rect.h <<= heightShift;
        baseline <<= heightShift;
    }

    // Reserve atlas space first. This is what assigns rect.x and rect.y.
    _drawGlyphAtlasAllocate(p, rect);
    _d2dBeginDrawing();

    const D2D1_RECT_F r{
        static_cast<f32>(rect.x),
        static_cast<f32>(rect.y),
        static_cast<f32>(rect.x + rect.w),
        static_cast<f32>(rect.y + rect.h),
    };

    auto shadingType = ShadingType::TextBuiltinGlyph;

    if (!BuiltinGlyphs::IsSoftFontChar(glyphIndex))
    {
        // This code works in tandem with SHADING_TYPE_TEXT_BUILTIN_GLYPH in our pixel shader.
        // Unless someone removed it, it should have a lengthy comment visually explaining
        // what each of the 3 RGB components do. The short version is:
        //   R: stretch the checkerboard pattern (Shape_Filled050) horizontally
        //   G: invert the pixels
        //   B: overrides the above and fills it
        static constexpr D2D1_COLOR_F shadeColorMap[] = {
            { 1, 0, 0, 1 }, // Shape_Filled025
            { 0, 0, 0, 1 }, // Shape_Filled050
            { 1, 1, 0, 1 }, // Shape_Filled075
            { 1, 1, 1, 1 }, // Shape_Filled100
        };
        BuiltinGlyphs::DrawBuiltinGlyph(p.d2dFactory.get(), _d2dRenderTarget.get(), _brush.get(), shadeColorMap, r, glyphIndex);
    }
    else
    {
        shadingType = _drawSoftFontGlyph(p, r, glyphIndex);
    }

    // The glyph fills its entire cell, so it starts at the left
    // edge of the cell and `baseline` pixels above the baseline.
    const auto entry = _drawGlyphAllocateEntry(row, fontFaceEntry, glyphIndex);
    entry->shadingType = shadingType;
    entry->overlapSplit = 0;
    entry->offset.x = 0;
    entry->offset.y = -baseline;
    entry->size.x = rect.w;
    entry->size.y = rect.h;
    entry->texcoord.x = rect.x;
    entry->texcoord.y = rect.y;

    if (isDoubleHeight)
    {
        _splitDoubleHeightGlyph(p, row, fontFaceEntry, entry);
    }

    return entry;
}
// Draws the soft font character `glyphIndex` (U+EF20 and up) into `rect` of the render target.
// The 1 bit-per-pixel pattern of the character is expanded into a small bitmap, which is then
// stretched over `rect`. Returns ShadingType::Default (= whitespace) if there's no pattern
// for the character and ShadingType::TextGrayscale otherwise.
BackendD3D::ShadingType BackendD3D::_drawSoftFontGlyph(const RenderingPayload& p, const D2D1_RECT_F& rect, u32 glyphIndex)
{
    const auto cellWidth = static_cast<size_t>(p.s->font->softFontCellSize.width);
    const auto cellHeight = static_cast<size_t>(p.s->font->softFontCellSize.height);

    // Each character occupies `cellHeight` consecutive entries of the pattern: one per pixel row.
    const auto patternOffset = cellHeight * (glyphIndex - 0xEF20u);
    const auto patternRows = til::safe_slice_len(p.s->font->softFontPattern, patternOffset, cellHeight);

    // This happens if someone wrote a U+EF2x character (by accident), but we don't even have soft fonts enabled yet.
    if (patternRows.empty() || patternRows.size() != cellHeight)
    {
        return ShadingType::Default;
    }

    // The bitmap is created lazily on first use.
    if (!_softFontBitmap)
    {
        // Allocating such a tiny texture is very wasteful (min. texture size on GPUs
        // right now is 64kB), but this is a seldom used feature, so it's fine...
        const auto dpi = static_cast<f32>(p.s->font->dpi);
        const D2D1_SIZE_U size{
            static_cast<UINT32>(cellWidth),
            static_cast<UINT32>(cellHeight),
        };
        const D2D1_BITMAP_PROPERTIES1 bitmapProperties{
            .pixelFormat = { DXGI_FORMAT_B8G8R8A8_UNORM, D2D1_ALPHA_MODE_PREMULTIPLIED },
            .dpiX = dpi,
            .dpiY = dpi,
        };
        THROW_IF_FAILED(_d2dRenderTarget->CreateBitmap(size, nullptr, 0, &bitmapProperties, _softFontBitmap.addressof()));
    }

    // Expand the pattern into 32-bit pixels. The leftmost pixel of each row is stored in bit 15.
    {
        auto pixels = Buffer<u32>{ cellWidth * cellHeight };
        auto out = pixels.begin();

        for (auto bits : patternRows)
        {
            for (size_t column = 0; column < cellWidth; ++column, bits <<= 1)
            {
                const auto isSet = (bits & 0x8000) != 0;
                *out++ = isSet ? 0xffffffff : 0x00000000;
            }
        }

        const auto pitch = static_cast<UINT32>(cellWidth * sizeof(u32));
        THROW_IF_FAILED(_softFontBitmap->CopyFromMemory(nullptr, pixels.data(), pitch));
    }

    // Stretch the bitmap over the target rect. The clip is popped again when leaving this function.
    _d2dRenderTarget->PushAxisAlignedClip(&rect, D2D1_ANTIALIAS_MODE_ALIASED);
    const auto popClip = wil::scope_exit([&]() {
        _d2dRenderTarget->PopAxisAlignedClip();
    });

    const auto wantsAliased = p.s->font->antialiasingMode == AntialiasingMode::Aliased;
    const auto interpolation = wantsAliased ? D2D1_INTERPOLATION_MODE_NEAREST_NEIGHBOR : D2D1_INTERPOLATION_MODE_HIGH_QUALITY_CUBIC;
    _d2dRenderTarget->DrawBitmap(_softFontBitmap.get(), &rect, 1, interpolation, nullptr, nullptr);

    return ShadingType::TextGrayscale;
}
// Reserves space of size rect.w/h in the glyph atlas and stores its position in `rect`.
// If the atlas is full, everything drawn so far is flushed, the atlas is reset and the
// allocation is retried once. Throws if the rect doesn't fit even then.
void BackendD3D::_drawGlyphAtlasAllocate(const RenderingPayload& p, stbrp_rect& rect)
{
    if (!stbrp_pack_rects(&_rectPacker, &rect, 1))
    {
        // The queued quads still refer to the current atlas contents,
        // so they are drawn before the atlas gets reset.
        _d2dEndDrawing();
        _flushQuads(p);
        _resetGlyphAtlas(p);

        if (!stbrp_pack_rects(&_rectPacker, &rect, 1))
        {
            THROW_HR(HRESULT_FROM_WIN32(ERROR_POSSIBLE_DEADLOCK));
        }
    }
}
// Inserts `glyphIndex` into the glyph cache that belongs to the line rendition of `row`
// and returns the entry with its shadingType set to ShadingType::Default.
BackendD3D::AtlasGlyphEntry* BackendD3D::_drawGlyphAllocateEntry(const ShapedRow& row, AtlasFontFaceEntry& fontFaceEntry, u32 glyphIndex)
{
    auto& glyphsForRendition = fontFaceEntry.glyphs[WI_EnumValue(row.lineRendition)];
    const auto entry = glyphsForRendition.insert(glyphIndex).first;
    entry->shadingType = ShadingType::Default;
    return entry;
}
// If this is a double-height glyph (DECDHL), we need to split it into 2 glyph entries:
// One for the top/bottom half each, because that's how DECDHL works. This will clip the
// `glyphEntry` to only contain the one specified by `fontFaceEntry.lineRendition`
// and create a second entry in our glyph cache hashmap that contains the other half.
//
// `glyphEntry` must describe the entire (unsplit) glyph and `row.lineRendition`
// determines which of the two halves `glyphEntry` will describe afterwards.
void BackendD3D::_splitDoubleHeightGlyph(const RenderingPayload& p, const ShapedRow& row, AtlasFontFaceEntry& fontFaceEntry, AtlasGlyphEntry* glyphEntry)
{
// Twice the line height, twice the descender gap. For both.
glyphEntry->offset.y -= p.s->font->descender;
const auto isTop = row.lineRendition == LineRendition::DoubleHeightTop;
const auto otherLineRendition = isTop ? LineRendition::DoubleHeightBottom : LineRendition::DoubleHeightTop;
// The other half is cached under the same glyph index, but in the cache of the other line rendition.
const auto entry2 = fontFaceEntry.glyphs[WI_EnumValue(otherLineRendition)].insert(glyphEntry->glyphIndex).first;
// Both entries start out as the full glyph and are clipped to their respective half below.
*entry2 = *glyphEntry;
const auto top = isTop ? glyphEntry : entry2;
const auto bottom = isTop ? entry2 : glyphEntry;
// The number of pixel rows of the glyph that are assigned to the top half, anywhere from 0 to all of them.
const auto topSize = clamp(-glyphEntry->offset.y - p.s->font->baseline, 0, static_cast<int>(glyphEntry->size.y));
top->offset.y += p.s->font->cellSize.y;
top->size.y = topSize;
// The bottom half consists of the remaining rows, so it
// starts `topSize` rows further down inside the glyph atlas.
bottom->offset.y += topSize;
bottom->size.y = std::max(0, bottom->size.y - topSize);
bottom->texcoord.y += topSize;
// Things like diacritics might be so small that they only exist on either half of the
// double-height row. This effectively turns the other (unneeded) side into whitespace.
if (!top->size.y)
{
top->shadingType = ShadingType::Default;
}
if (!bottom->size.y)
{
bottom->shadingType = ShadingType::Default;
}
}
// Queues the quads for all gridlines of row `y` (an index into p.rows): The left/right/top/bottom
// cell borders, the strikethrough and the various underline styles of each of its gridLineRanges.
void BackendD3D::_drawGridlines(const RenderingPayload& p, u16 y)
{
const auto row = p.rows[y];
// These are 1 if the line rendition doubles the width/height of the row and 0 otherwise.
const auto horizontalShift = static_cast<u8>(row->lineRendition != LineRendition::SingleWidth);
const auto verticalShift = static_cast<u8>(row->lineRendition >= LineRendition::DoubleHeightTop);
const auto cellSize = p.s->font->cellSize;
const auto rowTop = static_cast<i16>(cellSize.y * y);
const auto rowBottom = static_cast<i16>(rowTop + cellSize.y);
// For the bottom half of a double-height row, line positions are relative to the row above it.
auto textCellTop = rowTop;
if (row->lineRendition == LineRendition::DoubleHeightBottom)
{
textCellTop -= cellSize.y;
}
// Horizontal lines are clipped so that the top half of a double-height row doesn't draw
// below its row and the bottom half doesn't draw above its row.
const i32 clipTop = row->lineRendition == LineRendition::DoubleHeightBottom ? rowTop : 0;
const i32 clipBottom = row->lineRendition == LineRendition::DoubleHeightTop ? rowBottom : p.s->targetSize.y;
// Appends one vertical line per (possibly double-width) text cell within the range `r`.
// pos.position is the horizontal offset of the line within its cell and pos.height its thickness.
const auto appendVerticalLines = [&](const GridLineRange& r, FontDecorationPosition pos) {
const auto textCellWidth = cellSize.x << horizontalShift;
const auto offset = pos.position << horizontalShift;
const auto width = static_cast<u16>(pos.height << horizontalShift);
auto posX = r.from * cellSize.x + offset;
const auto end = r.to * cellSize.x;
for (; posX < end; posX += textCellWidth)
{
_appendQuad() = {
.shadingType = static_cast<u16>(ShadingType::SolidLine),
.position = { static_cast<i16>(posX), rowTop },
.size = { width, p.s->font->cellSize.y },
.color = r.gridlineColor,
};
}
};
// Appends a single horizontal line that spans the entire range `r`.
// pos.position is the vertical offset of the line within its cell and pos.height its thickness.
const auto appendHorizontalLine = [&](const GridLineRange& r, FontDecorationPosition pos, ShadingType shadingType, const u32 color) {
const auto offset = pos.position << verticalShift;
const auto height = static_cast<u16>(pos.height << verticalShift);
const auto left = static_cast<i16>(r.from * cellSize.x);
const auto width = static_cast<u16>((r.to - r.from) * cellSize.x);
i32 rt = textCellTop + offset;
i32 rb = rt + height;
rt = clamp(rt, clipTop, clipBottom);
rb = clamp(rb, clipTop, clipBottom);
// The line may have been clipped away entirely.
if (rt < rb)
{
_appendQuad() = {
.shadingType = static_cast<u16>(shadingType),
.renditionScale = { static_cast<u8>(1 << horizontalShift), static_cast<u8>(1 << verticalShift) },
.position = { left, static_cast<i16>(rt) },
.size = { width, static_cast<u16>(rb - rt) },
.color = color,
};
}
};
for (const auto& r : row->gridLineRanges)
{
// AtlasEngine.cpp shouldn't add any gridlines if they don't do anything.
assert(r.lines.any());
// Borders and strikethrough can be combined freely and use the gridline color.
if (r.lines.test(GridLines::Left))
{
appendVerticalLines(r, p.s->font->gridLeft);
}
if (r.lines.test(GridLines::Right))
{
appendVerticalLines(r, p.s->font->gridRight);
}
if (r.lines.test(GridLines::Top))
{
appendHorizontalLine(r, p.s->font->gridTop, ShadingType::SolidLine, r.gridlineColor);
}
if (r.lines.test(GridLines::Bottom))
{
appendHorizontalLine(r, p.s->font->gridBottom, ShadingType::SolidLine, r.gridlineColor);
}
if (r.lines.test(GridLines::Strikethrough))
{
appendHorizontalLine(r, p.s->font->strikethrough, ShadingType::SolidLine, r.gridlineColor);
}
// At most one underline style is drawn per range (the first one that matches), using the underline color.
if (r.lines.test(GridLines::Underline))
{
appendHorizontalLine(r, p.s->font->underline, ShadingType::SolidLine, r.underlineColor);
}
else if (r.lines.any(GridLines::DottedUnderline, GridLines::HyperlinkUnderline))
{
appendHorizontalLine(r, p.s->font->underline, ShadingType::DottedLine, r.underlineColor);
}
else if (r.lines.test(GridLines::DashedUnderline))
{
appendHorizontalLine(r, p.s->font->underline, ShadingType::DashedLine, r.underlineColor);
}
else if (r.lines.test(GridLines::CurlyUnderline))
{
appendHorizontalLine(r, _curlyUnderline, ShadingType::CurlyLine, r.underlineColor);
}
else if (r.lines.test(GridLines::DoubleUnderline))
{
for (const auto pos : p.s->font->doubleUnderline)
{
appendHorizontalLine(r, pos, ShadingType::SolidLine, r.underlineColor);
}
}
}
}
// Rebuilds _cursorRects for the current frame and appends one ShadingType::Cursor quad per rectangle.
//
// The cells covered by p.cursorRect (on its top row) are grouped into runs that share the same background
// color (with the alpha channel forced to 0xff). Each run yields a base rectangle that is then shaped according
// to the configured cursor type. _cursorPosition receives the cursor bounds in pixels; both _cursorPosition
// and _cursorRects are read again by _drawCursorForeground().
void BackendD3D::_drawCursorBackground(const RenderingPayload& p)
{
    _cursorRects.clear();

    if (p.cursorRect.empty())
    {
        return;
    }

    // Cursor bounds converted from cell coordinates to pixels.
    _cursorPosition = {
        p.s->font->cellSize.x * p.cursorRect.left,
        p.s->font->cellSize.y * p.cursorRect.top,
        p.s->font->cellSize.x * p.cursorRect.right,
        p.s->font->cellSize.y * p.cursorRect.bottom,
    };

    const auto cursorColor = p.s->cursor->cursorColor;
    // Index of the first cell of the cursor's row inside the background bitmap.
    const auto offset = p.cursorRect.top * p.colorBitmapRowStride;

    // NOTE: This loop intentionally has no increment expression. The inner loop below always advances x1
    // by at least one cell (the first cell trivially matches its own color) and leaves it on the first cell
    // of the next run. Incrementing x1 here as well would skip that cell and leave it without a cursor.
    for (auto x1 = p.cursorRect.left; x1 < p.cursorRect.right;)
    {
        const auto x0 = x1;
        const auto bg = p.backgroundBitmap[offset + x1] | 0xff000000;

        // Extend the run [x0, x1) for as long as the background color stays the same.
        for (; x1 < p.cursorRect.right && (p.backgroundBitmap[offset + x1] | 0xff000000) == bg; ++x1)
        {
        }

        const i16x2 position{
            static_cast<i16>(p.s->font->cellSize.x * x0),
            static_cast<i16>(p.s->font->cellSize.y * p.cursorRect.top),
        };
        const u16x2 size{
            static_cast<u16>(p.s->font->cellSize.x * (x1 - x0)),
            p.s->font->cellSize.y,
        };

        auto background = cursorColor;
        auto foreground = bg;

        // A cursor color of 0xffffffff selects inversion: The cursor is filled with the inverted RGB channels
        // of the cell background, and the 0xffffffff foreground tells _drawCursorForegroundSlowPath()
        // to invert the color of the overlapping glyphs as well.
        if (cursorColor == 0xffffffff)
        {
            background = bg ^ 0xffffff;
            foreground = 0xffffffff;
        }

        // The legacy console used to invert colors by just doing `bg ^ 0xc0c0c0`. This resulted
        // in a minimum squared distance of just 0.029195 across all possible color combinations.
        background = ColorFix::GetPerceivableColor(background, bg, 0.25f * 0.25f);

        // NOTE(review): c0 (and c1 below) are references into _cursorRects that are still used after further
        // emplace_back() calls. This assumes that the container doesn't reallocate for the few rectangles
        // added per cursor - confirm against the declaration of _cursorRects.
        auto& c0 = _cursorRects.emplace_back(position, size, background, foreground);

        switch (static_cast<CursorType>(p.s->cursor->cursorType))
        {
        case CursorType::Legacy:
        {
            // A bar at the bottom of the cell, heightPercentage percent tall (rounded to the nearest pixel).
            const auto height = (c0.size.y * p.s->cursor->heightPercentage + 50) / 100;
            c0.position.y += c0.size.y - height;
            c0.size.y = height;
            break;
        }
        case CursorType::VerticalBar:
            c0.size.x = p.s->font->thinLineWidth;
            break;
        case CursorType::Underscore:
            c0.position.y += p.s->font->underline.position;
            c0.size.y = p.s->font->underline.height;
            break;
        case CursorType::EmptyBox:
        {
            // c0 becomes the top edge and c1 the bottom edge of the box. The left and right
            // edges are only added for the first and last run of the cursor respectively.
            auto& c1 = _cursorRects.emplace_back(c0);
            if (x0 == p.cursorRect.left)
            {
                auto& c = _cursorRects.emplace_back(c0);
                // Make line a little shorter vertically so it doesn't overlap with the top/bottom horizontal lines.
                c.position.y += p.s->font->thinLineWidth;
                c.size.y -= 2 * p.s->font->thinLineWidth;
                // The actual adjustment...
                c.size.x = p.s->font->thinLineWidth;
            }
            if (x1 == p.cursorRect.right)
            {
                auto& c = _cursorRects.emplace_back(c0);
                // Make line a little shorter vertically so it doesn't overlap with the top/bottom horizontal lines.
                c.position.y += p.s->font->thinLineWidth;
                c.size.y -= 2 * p.s->font->thinLineWidth;
                // The actual adjustment...
                c.position.x += c.size.x - p.s->font->thinLineWidth;
                c.size.x = p.s->font->thinLineWidth;
            }
            c0.size.y = p.s->font->thinLineWidth;
            c1.position.y += c1.size.y - p.s->font->thinLineWidth;
            c1.size.y = p.s->font->thinLineWidth;
            break;
        }
        case CursorType::FullBox:
            // The base rectangle already covers the entire run.
            break;
        case CursorType::DoubleUnderscore:
        {
            // Two thin lines at the two double-underline positions of the font.
            auto& c1 = _cursorRects.emplace_back(c0);
            c0.position.y += p.s->font->doubleUnderline[0].position;
            c0.size.y = p.s->font->thinLineWidth;
            c1.position.y += p.s->font->doubleUnderline[1].position;
            c1.size.y = p.s->font->thinLineWidth;
            break;
        }
        default:
            break;
        }
    }

    for (const auto& c : _cursorRects)
    {
        _appendQuad() = {
            .shadingType = static_cast<u16>(ShadingType::Cursor),
            .position = c.position,
            .size = c.size,
            .color = c.background,
        };
    }
}
void BackendD3D::_drawCursorForeground()
{
// NOTE: _appendQuad() may reallocate the _instances vector. It's important to iterate
// by index, because pointers (or iterators) would get invalidated. It's also important
// to cache the original _instancesCount since it'll get changed with each append.
auto instancesCount = _instancesCount;
size_t instancesOffset = 0;
assert(instancesCount != 0);
// All of the text drawing primitives are drawn as a single block, after drawing
// the background and cursor background and before drawing the selection overlay.
// To avoid having to check the shadingType in the loop below, we'll find the
// start and end of this "block" here in advance.
for (; instancesOffset < instancesCount; ++instancesOffset)
{
const auto shadingType = static_cast<ShadingType>(_instances[instancesOffset].shadingType);
if (shadingType >= ShadingType::TextDrawingFirst && shadingType <= ShadingType::TextDrawingLast)
{
break;
}
}
// We can also skip any instances (= any rows) at the beginning that are clearly not overlapping with
// the cursor. This reduces the CPU cost of this function by roughly half (a few microseconds).
for (; instancesOffset < instancesCount; ++instancesOffset)
{
const auto& it = _instances[instancesOffset];
if ((it.position.y + it.size.y) > _cursorPosition.top)
{
break;
}
}
// Now do the same thing as above, but backwards from the end.
for (; instancesCount > instancesOffset; --instancesCount)
{
const auto shadingType = static_cast<ShadingType>(_instances[instancesCount - 1].shadingType);
if (shadingType >= ShadingType::TextDrawingFirst && shadingType <= ShadingType::TextDrawingLast)
{
break;
}
}
for (; instancesCount > instancesOffset; --instancesCount)
{
const auto& it = _instances[instancesCount - 1];
if (it.position.y < _cursorPosition.bottom)
{
break;
}
}
// For cursors with multiple rectangles this really isn't all that fast, because it iterates
// over the instances vector multiple times. But I also don't really care, because the
// double-underline and empty-box cursors are pretty annoying to deal with in any case.
//
// It would definitely help if instead of position & size QuadInstances would use left/top/right/bottom
// with f32, because then computing the intersection would be much faster via SIMD. But that would
// make the struct size larger and cost more power to transmit more data to the GPU. ugh.
for (const auto& c : _cursorRects)
{
const int cursorL = c.position.x;
const int cursorT = c.position.y;
const int cursorR = cursorL + c.size.x;
const int cursorB = cursorT + c.size.y;
for (size_t i = instancesOffset; i < instancesCount; ++i)
{
const auto& it = _instances[i];
const int instanceL = it.position.x;
const int instanceT = it.position.y;
const int instanceR = instanceL + it.size.x;
const int instanceB = instanceT + it.size.y;
if (instanceL < cursorR && instanceR > cursorL && instanceT < cursorB && instanceB > cursorT)
{
// The _instances vector is _huge_ (easily up to 50k items) whereas only 1-2 items will actually overlap
// with the cursor. --> Make this loop more compact by putting as much as possible into a function call.
const auto added = _drawCursorForegroundSlowPath(c, i);
i += added;
instancesCount += added;
}
}
}
}
// Splits the instance at _instances[offset], which overlaps with the cursor rectangle `c`, into
// up to 4 "cutouts" that lie outside of the cursor plus one quad for the intersection with the cursor.
// The cutouts keep the original color, while the intersection gets the cursor's foreground color.
//
// Returns the number of instances that were inserted into _instances directly after `offset`
// (cutoutCount - 1, or 0 if there are no cutouts), so that the caller can adjust its loop indices.
// The intersection quad is not part of that count: It's either appended via _appendQuad()
// or written into the existing slot at `offset`.
size_t BackendD3D::_drawCursorForegroundSlowPath(const CursorRect& c, size_t offset)
{
// We won't die from copying 24 bytes. It simplifies the code below especially in
// respect to when/if we overwrite the _instances[offset] slot with a cutout.
#pragma warning(suppress : 26820) // This is a potentially expensive copy operation. Consider using a reference unless a copy is required (p.9).
const auto it = _instances[offset];
// There's one special exception to the rule: Emojis. We currently don't really support inverting
// (or reversing) colored glyphs like that, so we can return early here and avoid cutting them up.
// It'd be too expensive to check for these rare glyph types inside the _drawCursorForeground() loop.
if (static_cast<ShadingType>(it.shadingType) == ShadingType::TextPassthrough)
{
return 0;
}
// Cursor and instance bounds as left/top/right/bottom, widened to int.
const int cursorL = c.position.x;
const int cursorT = c.position.y;
const int cursorR = cursorL + c.size.x;
const int cursorB = cursorT + c.size.y;
const int instanceL = it.position.x;
const int instanceT = it.position.y;
const int instanceR = instanceL + it.size.x;
const int instanceB = instanceT + it.size.y;
const auto intersectionL = std::max(cursorL, instanceL);
const auto intersectionT = std::max(cursorT, instanceT);
const auto intersectionR = std::min(cursorR, instanceR);
const auto intersectionB = std::min(cursorB, instanceB);
// We should only get called if there's actually an intersection.
assert(intersectionL < intersectionR && intersectionT < intersectionB);
// We need to ensure that the glyph doesn't "dirty" the cursor background with its un-inverted/un-reversed color.
// If it did, and we'd draw the inverted/reversed glyph on top, it would look smudged.
// As such, this cuts a cursor-sized hole into the original glyph and splits it up.
//
// > Always initialize an object
// I would pay money if this warning was a little smarter. The array can remain uninitialized,
// because it acts like a tiny small_vector, but without the assertions.
#pragma warning(suppress : 26494) // Variable 'cutouts' is uninitialized. Always initialize an object (type.5).
rect<int> cutouts[4];
size_t cutoutCount = 0;
// The strip above the intersection, spanning the full width of the instance.
if (instanceT < intersectionT)
{
cutouts[cutoutCount++] = { instanceL, instanceT, instanceR, intersectionT };
}
// The strip below the intersection, spanning the full width of the instance.
if (instanceB > intersectionB)
{
cutouts[cutoutCount++] = { instanceL, intersectionB, instanceR, instanceB };
}
// The piece to the left of the intersection. It's only as tall as the intersection,
// because the strips above and below already cover the remaining area.
if (instanceL < intersectionL)
{
cutouts[cutoutCount++] = { instanceL, intersectionT, intersectionL, intersectionB };
}
// The piece to the right of the intersection, also limited to the height of the intersection.
if (instanceR > intersectionR)
{
cutouts[cutoutCount++] = { intersectionR, intersectionT, instanceR, intersectionB };
}
// The first cutout reuses the existing slot at `offset`, so only the remaining ones need new slots.
const auto addedInstances = cutoutCount ? cutoutCount - 1 : 0;
// Make place for cutoutCount-many items at position.
// NOTE: _bumpInstancesSize() reallocates the vector and all references to _instances will now be invalid.
if (addedInstances)
{
const auto instancesCount = _instancesCount;
_instancesCount += addedInstances;
if (_instancesCount >= _instances.size())
{
_bumpInstancesSize();
}
// Shift everything from `offset` up to the previous end of the list back by addedInstances slots.
// src is only fetched after the potential reallocation above. The ranges overlap, hence memmove().
const auto src = _instances.data() + offset;
const auto dst = src + addedInstances;
const auto count = instancesCount - offset;
assert(src >= _instances.begin() && (src + count) < _instances.end());
assert(dst >= _instances.begin() && (dst + count) < _instances.end());
memmove(dst, src, count * sizeof(QuadInstance));
}
// Now that there's space we can write the glyph cutouts back into the instances vector.
for (size_t i = 0; i < cutoutCount; ++i)
{
const auto& cutout = cutouts[i];
auto& target = _instances[offset + i];
target.shadingType = it.shadingType;
target.renditionScale.x = it.renditionScale.x;
target.renditionScale.y = it.renditionScale.y;
target.position.x = static_cast<i16>(cutout.left);
target.position.y = static_cast<i16>(cutout.top);
target.size.x = static_cast<u16>(cutout.right - cutout.left);
target.size.y = static_cast<u16>(cutout.bottom - cutout.top);
// Offset the texture coordinate by the same amount the cutout is offset from the instance origin.
target.texcoord.x = static_cast<u16>(it.texcoord.x + cutout.left - instanceL);
target.texcoord.y = static_cast<u16>(it.texcoord.y + cutout.top - instanceT);
target.color = it.color;
}
// A cursor foreground of 0xffffffff means "invert": Flip the RGB channels of the instance color.
// Otherwise the cursor's foreground color is used as is.
auto color = c.foreground == 0xffffffff ? it.color ^ 0xffffff : c.foreground;
// NOTE(review): presumably this adjusts the color to keep it distinguishable from the cursor background - confirm in ColorFix.
color = ColorFix::GetPerceivableColor(color, c.background, 0.5f * 0.5f);
// If the cursor covers the entire glyph (like, let's say, a full-box cursor with an ASCII character),
// we don't append a new quad, but rather reuse the one that already exists (cutoutCount == 0).
auto& target = cutoutCount ? _appendQuad() : _instances[offset];
target.shadingType = it.shadingType;
target.renditionScale.x = it.renditionScale.x;
target.renditionScale.y = it.renditionScale.y;
target.position.x = static_cast<i16>(intersectionL);
target.position.y = static_cast<i16>(intersectionT);
target.size.x = static_cast<u16>(intersectionR - intersectionL);
target.size.y = static_cast<u16>(intersectionB - intersectionT);
target.texcoord.x = static_cast<u16>(it.texcoord.x + intersectionL - instanceL);
target.texcoord.y = static_cast<u16>(it.texcoord.y + intersectionT - instanceT);
target.color = color;
return addedInstances;
}
// Appends ShadingType::Selection quads for the selected column span of each row in p.rows.
//
// Directly adjacent rows with an identical [selectionFrom, selectionTo) span are merged
// into a single quad by growing the previously appended quad downwards.
void BackendD3D::_drawSelection(const RenderingPayload& p)
{
    u16 y = 0;
    // The span of the most recently appended quad. (0, 0) means "none", which can
    // never match a selected row, because those always have selectionTo > selectionFrom.
    u16 lastFrom = 0;
    u16 lastTo = 0;

    for (const auto& row : p.rows)
    {
        if (row->selectionTo > row->selectionFrom)
        {
            // If the current selection line matches the previous one, we can just extend the previous quad downwards.
            // The way this is implemented isn't very smart, but we also don't have very many rows to iterate through.
            if (row->selectionFrom == lastFrom && row->selectionTo == lastTo)
            {
                _getLastQuad().size.y += p.s->font->cellSize.y;
            }
            else
            {
                _appendQuad() = {
                    .shadingType = static_cast<u16>(ShadingType::Selection),
                    .position = {
                        static_cast<i16>(p.s->font->cellSize.x * row->selectionFrom),
                        static_cast<i16>(p.s->font->cellSize.y * y),
                    },
                    .size = {
                        static_cast<u16>(p.s->font->cellSize.x * (row->selectionTo - row->selectionFrom)),
                        static_cast<u16>(p.s->font->cellSize.y),
                    },
                    .color = p.s->misc->selectionColor,
                };
                lastFrom = row->selectionFrom;
                lastTo = row->selectionTo;
            }
        }
        else
        {
            // A row without selection ends the current run. Without this reset, a later row that happens to have
            // the same span would extend the old quad by one row: It would then cover this unselected row,
            // while the row that's actually selected would be left without highlight.
            lastFrom = 0;
            lastTo = 0;
        }

        y++;
    }
}
// Debug aid, compiled in only if ATLAS_DEBUG_SHOW_DIRTY is set: Records the dirty rectangle of this frame
// in the _presentRects ring buffer and draws a translucent, colored quad for every non-empty entry in it.
void BackendD3D::_debugShowDirty(const RenderingPayload& p)
{
#if ATLAS_DEBUG_SHOW_DIRTY
    const auto slots = std::size(_presentRects);

    // Store the newest rectangle and advance the write position, wrapping around at the end.
    _presentRects[_presentRectsPos] = p.dirtyRectInPx;
    _presentRectsPos = (_presentRectsPos + 1) % slots;

    // _presentRectsPos now refers to the slot that was written the longest time ago, so this
    // visits the rectangles from oldest to newest. Each position gets its own palette color.
    for (size_t i = 0; i < slots; ++i)
    {
        const auto& dirty = _presentRects[(_presentRectsPos + i) % slots];
        if (!dirty.non_empty())
        {
            continue;
        }

        _appendQuad() = {
            .shadingType = static_cast<u16>(ShadingType::Selection),
            .position = {
                static_cast<i16>(dirty.left),
                static_cast<i16>(dirty.top),
            },
            .size = {
                static_cast<u16>(dirty.right - dirty.left),
                static_cast<u16>(dirty.bottom - dirty.top),
            },
            // 0x1f in the topmost byte yields a mostly transparent overlay.
            .color = til::colorbrewer::pastel1[i] | 0x1f000000,
        };
    }
#endif
}
// Debug aid, compiled in only if ATLAS_DEBUG_DUMP_RENDER_TARGET is set:
// Saves the current swap chain buffer as a PNG file, one file per call.
void BackendD3D::_debugDumpRenderTarget(const RenderingPayload& p)
{
#if ATLAS_DEBUG_DUMP_RENDER_TARGET
    // On the very first dump the base path is resolved (environment variables get expanded) and created.
    if (_dumpRenderTargetCounter == 0)
    {
        const auto capacity = gsl::narrow_cast<DWORD>(std::size(_dumpRenderTargetBasePath));
        ExpandEnvironmentStringsW(ATLAS_DEBUG_DUMP_RENDER_TARGET_PATH, &_dumpRenderTargetBasePath[0], capacity);
        std::filesystem::create_directories(_dumpRenderTargetBasePath);
    }

    // The file name is made up of the process ID and the zero-padded running counter.
    wchar_t fileName[MAX_PATH];
    swprintf_s(fileName, L"%s\\%u_%08zu.png", &_dumpRenderTargetBasePath[0], GetCurrentProcessId(), _dumpRenderTargetCounter);

    SaveTextureToPNG(p.deviceContext.get(), _swapChainManager.GetBuffer().get(), p.s->font->dpi, &fileName[0]);
    ++_dumpRenderTargetCounter;
#endif
}
// Runs the custom (user provided) shader pass: Uploads its constant buffer, draws with the custom
// vertex/pixel shaders into _renderTargetView, and then restores the pipeline state that
// regular quad rendering uses. Finally the entire target is marked as dirty.
void BackendD3D::_executeCustomShader(RenderingPayload& p)
{
    const auto ctx = p.deviceContext.get();

    // Step 1: Fill the constant buffer of the custom shader.
    {
        // _recreateCustomShader() initializes the two _customShader*PerfTick* members
        // used below. See the comment over there for an explanation of what they do.
        const auto ticks = queryPerfCount();
        const auto seconds = static_cast<int>(ticks % _customShaderPerfTickMod) * _customShaderSecsPerPerfTick;
        const auto dpiScale = static_cast<f32>(p.s->font->dpi) / static_cast<f32>(USER_DEFAULT_SCREEN_DPI);
        const auto widthInPx = static_cast<f32>(_viewportCellCount.x * p.s->font->cellSize.x);
        const auto heightInPx = static_cast<f32>(_viewportCellCount.y * p.s->font->cellSize.y);

        const CustomConstBuffer constants{
            .time = seconds,
            .scale = dpiScale,
            .resolution = {
                widthInPx,
                heightInPx,
            },
            .background = colorFromU32Premultiply<f32x4>(p.s->misc->backgroundColor),
        };

        D3D11_MAPPED_SUBRESOURCE mapping{};
        THROW_IF_FAILED(ctx->Map(_customShaderConstantBuffer.get(), 0, D3D11_MAP_WRITE_DISCARD, 0, &mapping));
        memcpy(mapping.pData, &constants, sizeof(constants));
        ctx->Unmap(_customShaderConstantBuffer.get(), 0);
    }

    // Step 2: Set up the pipeline for the custom shader.
    {
        // The render target has to be switched to _renderTargetView before anything else, because
        // _customOffscreenTextureView gets bound as a shader resource further down.
        // NOTE(review): presumably _customRenderTargetView (restored at the end of this function) and
        // _customOffscreenTextureView refer to the same texture, which can't be both at once - confirm.
        ctx->OMSetRenderTargets(1, _renderTargetView.addressof(), nullptr);

        // IA: Input Assembler. No buffers or input layout are bound for this draw.
        ctx->IASetIndexBuffer(nullptr, DXGI_FORMAT_UNKNOWN, 0);
        ctx->IASetInputLayout(nullptr);
        ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
        ctx->IASetVertexBuffers(0, 0, nullptr, nullptr, nullptr);

        // VS: Vertex Shader
        ctx->VSSetShader(_customVertexShader.get(), nullptr, 0);
        ctx->VSSetConstantBuffers(0, 0, nullptr);

        // PS: Pixel Shader
        ctx->PSSetShader(_customPixelShader.get(), nullptr, 0);
        ctx->PSSetConstantBuffers(0, 1, _customShaderConstantBuffer.addressof());

        ID3D11ShaderResourceView* const customViews[]{
            _customOffscreenTextureView.get(), // The terminal contents
            _customShaderTextureView.get(), // the experimental.pixelShaderImagePath, if there is one
        };
        // The second view is only bound if it's actually set.
        const UINT customViewCount = customViews[1] ? 2 : 1;
        ctx->PSSetShaderResources(0, customViewCount, &customViews[0]);
        ctx->PSSetSamplers(0, 1, _customShaderSamplerState.addressof());

        // OM: Output Merger
        ctx->OMSetBlendState(nullptr, nullptr, 0xffffffff);
    }

    // Step 3: Draw 4 vertices as a triangle strip.
    ctx->Draw(4, 0);

    // Step 4: Put the state for regular quad rendering back in place.
    {
        // IA: Input Assembler
        ID3D11Buffer* quadBuffers[]{ _vertexBuffer.get(), _instanceBuffer.get() };
        static constexpr UINT quadStrides[]{ sizeof(f32x2), sizeof(QuadInstance) };
        static constexpr UINT quadOffsets[]{ 0, 0 };
        ctx->IASetIndexBuffer(_indexBuffer.get(), DXGI_FORMAT_R16_UINT, 0);
        ctx->IASetInputLayout(_inputLayout.get());
        ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
        ctx->IASetVertexBuffers(0, 2, &quadBuffers[0], &quadStrides[0], &quadOffsets[0]);

        // VS: Vertex Shader
        ctx->VSSetShader(_vertexShader.get(), nullptr, 0);
        ctx->VSSetConstantBuffers(0, 1, _vsConstantBuffer.addressof());

        // PS: Pixel Shader
        ID3D11ShaderResourceView* quadViews[]{ _backgroundBitmapView.get(), _glyphAtlasView.get() };
        ctx->PSSetShader(_pixelShader.get(), nullptr, 0);
        ctx->PSSetConstantBuffers(0, 1, _psConstantBuffer.addressof());
        ctx->PSSetShaderResources(0, 2, &quadViews[0]);
        ctx->PSSetSamplers(0, 0, nullptr);

        // OM: Output Merger
        ctx->OMSetBlendState(_blendState.get(), nullptr, 0xffffffff);
        ctx->OMSetRenderTargets(1, _customRenderTargetView.addressof(), nullptr);
    }

    // A custom shader may change any pixel of the output. Declaring the entire
    // target as dirty indirectly disables Present1() and its dirty rects.
    p.dirtyRectInPx = { 0, 0, p.s->targetSize.x, p.s->targetSize.y };
}
TIL_FAST_MATH_END