mirror of
https://github.com/microsoft/terminal.git
synced 2026-02-03 18:17:40 -06:00
Improve sixel performance (#19639)
Together with #19640 this bumps the performance on my HW by around 60%. You can now stream 4K images at >60 FPS on a high-end CPU. ## Validation Steps Performed * Used imagemagick to dump large images ✅
This commit is contained in:
parent
64caebecd3
commit
c0debf8e26
1
.github/actions/spelling/expect/expect.txt
vendored
1
.github/actions/spelling/expect/expect.txt
vendored
@ -1017,6 +1017,7 @@ minkernel
|
||||
MINMAXINFO
|
||||
minwin
|
||||
minwindef
|
||||
misprediction
|
||||
MMBB
|
||||
mmcc
|
||||
MMCPL
|
||||
|
||||
@ -729,6 +729,11 @@ void SixelParser::_decreaseFilledBackgroundHeight(const int decreasedHeight) noe
|
||||
}
|
||||
}
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 26429) // Symbol 'imageBufferPtr' is never tested for nullness, it can be marked as not_null (f.23).
|
||||
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
|
||||
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1).
|
||||
|
||||
void SixelParser::_writeToImageBuffer(int sixelValue, int repeatCount)
|
||||
{
|
||||
// On terminals that support the raster attributes command (which sets the
|
||||
@ -736,32 +741,88 @@ void SixelParser::_writeToImageBuffer(int sixelValue, int repeatCount)
|
||||
// is received. So if we haven't filled it yet, we need to do so now.
|
||||
_fillImageBackground();
|
||||
|
||||
repeatCount = std::min(repeatCount, _imageMaxWidth - _imageCursor.x);
|
||||
if (repeatCount <= 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (sixelValue == 0)
|
||||
{
|
||||
_imageCursor.x += repeatCount;
|
||||
return;
|
||||
}
|
||||
|
||||
// This allows us to unsafely cast _imageBuffer to uint16_t
|
||||
// and benefit from compiler/STL optimizations.
|
||||
static_assert(sizeof(IndexedPixel) == sizeof(int16_t));
|
||||
static_assert(alignof(IndexedPixel) == alignof(int16_t));
|
||||
|
||||
// Then we need to render the 6 vertical pixels that are represented by the
|
||||
// bits in the sixel value. Although note that each of these sixel pixels
|
||||
// may cover more than one device pixel, depending on the aspect ratio.
|
||||
const auto targetOffset = _imageCursor.y * _imageMaxWidth + _imageCursor.x;
|
||||
auto imageBufferPtr = std::next(_imageBuffer.data(), targetOffset);
|
||||
repeatCount = std::min(repeatCount, _imageMaxWidth - _imageCursor.x);
|
||||
for (auto i = 0; i < 6; i++)
|
||||
const auto foreground = std::bit_cast<int16_t>(_foregroundPixel);
|
||||
auto imageBufferPtr = reinterpret_cast<int16_t*>(_imageBuffer.data() + targetOffset);
|
||||
|
||||
// A aspect ratio of 1:1 is the most common and worth optimizing.
|
||||
if (_pixelAspectRatio == 1)
|
||||
{
|
||||
if (sixelValue & 1)
|
||||
do
|
||||
{
|
||||
auto repeatAspectRatio = _pixelAspectRatio;
|
||||
do
|
||||
// This gets unrolled by MSVC. It's written this way to use CMOV instructions.
|
||||
// Modern CPUs have fat caches and deep pipelines. It's better to do pointless reads
|
||||
// from memory than causing branch misprediction, as sixelValue is highly random.
|
||||
for (int i = 0; i < 6; i++)
|
||||
{
|
||||
std::fill_n(imageBufferPtr, repeatCount, _foregroundPixel);
|
||||
std::advance(imageBufferPtr, _imageMaxWidth);
|
||||
} while (--repeatAspectRatio > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::advance(imageBufferPtr, _imageMaxWidth * _pixelAspectRatio);
|
||||
}
|
||||
sixelValue >>= 1;
|
||||
const auto test = sixelValue & (1 << i);
|
||||
// Possibly pointless read from memory, but...
|
||||
const auto before = imageBufferPtr[i * _imageMaxWidth];
|
||||
// ...it allows the compiler to turn this into a CMOV (= no branch misprediction).
|
||||
const auto after = test ? foreground : before;
|
||||
imageBufferPtr[i * _imageMaxWidth] = after;
|
||||
}
|
||||
|
||||
_imageCursor.x += 1;
|
||||
imageBufferPtr += 1;
|
||||
repeatCount -= 1;
|
||||
} while (repeatCount > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (auto i = 0; i < 6; i++)
|
||||
{
|
||||
if (sixelValue & 1)
|
||||
{
|
||||
auto repeatAspectRatio = _pixelAspectRatio;
|
||||
do
|
||||
{
|
||||
// If this used std::fill_n or just a primitive loop, MSVC would compile
|
||||
// it to a `rep stosw` instruction, which has a high startup cost.
|
||||
// This is not ideal when our repeatCount is almost always small.
|
||||
// The way this does ptr++ and len-- is also ideal for optimization.
|
||||
auto ptr = imageBufferPtr;
|
||||
auto remaining = repeatCount;
|
||||
do
|
||||
{
|
||||
__iso_volatile_store16(ptr++, foreground);
|
||||
} while (--remaining != 0);
|
||||
|
||||
imageBufferPtr += _imageMaxWidth;
|
||||
} while (--repeatAspectRatio > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::advance(imageBufferPtr, _imageMaxWidth * _pixelAspectRatio);
|
||||
}
|
||||
sixelValue >>= 1;
|
||||
}
|
||||
_imageCursor.x += repeatCount;
|
||||
}
|
||||
_imageCursor.x += repeatCount;
|
||||
}
|
||||
|
||||
#pragma warning(pop)
|
||||
|
||||
void SixelParser::_eraseImageBufferRows(const int rowCount, const til::CoordType rowOffset) noexcept
|
||||
{
|
||||
const auto pixelCount = rowCount * _cellSize.height;
|
||||
|
||||
@ -42,7 +42,7 @@ namespace Microsoft::Console::VirtualTerminal
|
||||
// to retain the 16-bit size.
|
||||
static constexpr size_t MAX_COLORS = 256;
|
||||
using IndexType = uint8_t;
|
||||
struct IndexedPixel
|
||||
struct alignas(int16_t) IndexedPixel
|
||||
{
|
||||
uint8_t transparent = false;
|
||||
IndexType colorIndex = 0;
|
||||
@ -104,8 +104,8 @@ namespace Microsoft::Console::VirtualTerminal
|
||||
const size_t _maxColors;
|
||||
size_t _colorsUsed = 0;
|
||||
size_t _colorsAvailable = 0;
|
||||
bool _colorTableChanged = false;
|
||||
IndexedPixel _foregroundPixel = {};
|
||||
bool _colorTableChanged = false;
|
||||
|
||||
void _initImageBuffer();
|
||||
void _resizeImageBuffer(const til::CoordType requiredHeight);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user