Ok, habe ein bisschen rumgebastelt und die Performance massiv verbessert.
Code: Alles auswählen
#include <emmintrin.h>
#include <tmmintrin.h>

#include <cassert>
// Factors by which the YUV components are mixed. (In 9.6 fixed point format)
static const __m128i RedVFactor = _mm_set1_epi16(static_cast<short>(2.032 * 64 + 0.5));
static const __m128i GreenUFactor = _mm_set1_epi16(static_cast<short>(0.395 * 64 + 0.5));
static const __m128i GreenVFactor = _mm_set1_epi16(static_cast<short>(0.581 * 64 + 0.5));
static const __m128i BlueUFactor = _mm_set1_epi16(static_cast<short>(1.140 * 64 + 0.5));
// Masks that move the corresponding bytes into the right position.
static const __m128i ShuffleRed = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
static const __m128i ShuffleGreen = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
static const __m128i ShuffleBlue = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
static const __m128i Select0 = _mm_setr_epi8(-1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
static const __m128i Select1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
static const __m128i Select2 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
void convertPlanesYUV420pToBGRWithSSEVersion2(unsigned char const* PlaneY, unsigned char const* PlaneU, unsigned char const* PlaneV, unsigned char* TargetBuffer, int ResX, int ResY) {
// The following code works with 16 by 2 blocks of pixels at once.
static const int StepSizeX = 16;
static const int StepSizeY = 2;
assert((ResX % StepSizeX) == 0); // I hope this is given and must not be handled?
assert((ResY % StepSizeY) == 0); // I hope this is given and must not be handled?
for (int CntY = 0; CntY < ResY; CntY += StepSizeY) {
const unsigned char* const CurrentPlaneY = PlaneY + CntY * ResX;
const unsigned char* const CurrentPlaneU = PlaneU + (CntY / 2) * (ResX / 2);
const unsigned char* const CurrentPlaneV = PlaneV + (CntY / 2) * (ResX / 2);
unsigned char* const CurrentTargetBuffer = TargetBuffer + CntY * (ResX * 3);
for (int CntX = 0; CntX < ResX; CntX += StepSizeX) {
// Load the data...
const __m128i LoadedY0 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(CurrentPlaneY + CntX)); // Load 16 bytes
const __m128i LoadedY1 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(CurrentPlaneY + CntX + ResX)); // Load 16 bytes
const __m128i LoadedU = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(CurrentPlaneU + CntX / 2)); // Load 8 bytes
const __m128i LoadedV = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(CurrentPlaneV + CntX / 2)); // Load 8 bytes
// Bring it into 16 bit ints and adjust the format to
// (1 bit sign, 9 bits integral, 6 bits fractional).
// The integral part is a little bigger than usual to avoid intermediate overflow of the multiplication or the clamping.
const __m128i Zero = _mm_setzero_si128();
const __m128i Y00 = _mm_sub_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(LoadedY0, Zero), 6), _mm_set1_epi16(16 << 6)); // First 8 shorts of first row.
const __m128i Y10 = _mm_sub_epi16(_mm_slli_epi16(_mm_unpackhi_epi8(LoadedY0, Zero), 6), _mm_set1_epi16(16 << 6)); // Second 8 shorts of first row.
const __m128i Y01 = _mm_sub_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(LoadedY1, Zero), 6), _mm_set1_epi16(16 << 6)); // First 8 shorts of second row.
const __m128i Y11 = _mm_sub_epi16(_mm_slli_epi16(_mm_unpackhi_epi8(LoadedY1, Zero), 6), _mm_set1_epi16(16 << 6)); // Second 8 shorts of second row.
const __m128i U = _mm_sub_epi16(_mm_unpacklo_epi8(LoadedU, Zero), _mm_set1_epi16(128)); // Don't shift them yet. They will be shifted by the multiplication by the *Factors.
const __m128i V = _mm_sub_epi16(_mm_unpacklo_epi8(LoadedV, Zero), _mm_set1_epi16(128)); // Don't shift them yet. They will be shifted by the multiplication by the *Factors
// Calculate resulting red components...
const __m128i RedAdjust = _mm_mullo_epi16(V, RedVFactor);
const __m128i R00 = _mm_adds_epi16(Y00, _mm_unpacklo_epi16(RedAdjust, RedAdjust));
const __m128i R10 = _mm_adds_epi16(Y10, _mm_unpackhi_epi16(RedAdjust, RedAdjust));
const __m128i R01 = _mm_adds_epi16(Y01, _mm_unpacklo_epi16(RedAdjust, RedAdjust));
const __m128i R11 = _mm_adds_epi16(Y11, _mm_unpackhi_epi16(RedAdjust, RedAdjust));
// Calculate resulting green components...
const __m128i GreenAdjust = _mm_adds_epi16(_mm_mullo_epi16(U, GreenUFactor), _mm_mullo_epi16(V, GreenVFactor));
const __m128i G00 = _mm_subs_epi16(Y00, _mm_unpacklo_epi16(GreenAdjust, GreenAdjust));
const __m128i G10 = _mm_subs_epi16(Y10, _mm_unpackhi_epi16(GreenAdjust, GreenAdjust));
const __m128i G01 = _mm_subs_epi16(Y01, _mm_unpacklo_epi16(GreenAdjust, GreenAdjust));
const __m128i G11 = _mm_subs_epi16(Y11, _mm_unpackhi_epi16(GreenAdjust, GreenAdjust));
// Calculate resulting blue components...
const __m128i BlueAdjust = _mm_mullo_epi16(U, BlueUFactor);
const __m128i B00 = _mm_adds_epi16(Y00, _mm_unpacklo_epi16(BlueAdjust, BlueAdjust));
const __m128i B10 = _mm_adds_epi16(Y10, _mm_unpackhi_epi16(BlueAdjust, BlueAdjust));
const __m128i B01 = _mm_adds_epi16(Y01, _mm_unpacklo_epi16(BlueAdjust, BlueAdjust));
const __m128i B11 = _mm_adds_epi16(Y11, _mm_unpackhi_epi16(BlueAdjust, BlueAdjust));
// Compress shorts back into bytes.
// (Also does the clamping.)
const __m128i OutR0 = _mm_packus_epi16(_mm_srai_epi16(R00, 6), _mm_srai_epi16(R10, 6)); // First row, To do rounding, add 32 before shifting.
const __m128i OutR1 = _mm_packus_epi16(_mm_srai_epi16(R01, 6), _mm_srai_epi16(R11, 6)); // Second row
const __m128i OutG0 = _mm_packus_epi16(_mm_srai_epi16(G00, 6), _mm_srai_epi16(G10, 6));
const __m128i OutG1 = _mm_packus_epi16(_mm_srai_epi16(G01, 6), _mm_srai_epi16(G11, 6));
const __m128i OutB0 = _mm_packus_epi16(_mm_srai_epi16(B00, 6), _mm_srai_epi16(B10, 6));
const __m128i OutB1 = _mm_packus_epi16(_mm_srai_epi16(B01, 6), _mm_srai_epi16(B11, 6));
// Now we need to write these values out in the correct order.
// First bring the bytes into their final positions.
const __m128i RedBytes0 = _mm_shuffle_epi8(OutR0, ShuffleRed);
const __m128i GreenBytes0 = _mm_shuffle_epi8(OutG0, ShuffleGreen);
const __m128i BlueBytes0 = _mm_shuffle_epi8(OutB0, ShuffleBlue);
const __m128i RedBytes1 = _mm_shuffle_epi8(OutR1, ShuffleRed);
const __m128i GreenBytes1 = _mm_shuffle_epi8(OutG1, ShuffleGreen);
const __m128i BlueBytes1 = _mm_shuffle_epi8(OutB1, ShuffleBlue);
// Now choose 1 red byte, 1 green byte and 1 blue byte in an alternating fashion.
// I also tried using "_mm_blendv_epi8" from SSE4.1, but it's actually slower by 15% overall.
const __m128i Out0 = _mm_or_si128(_mm_and_si128(BlueBytes0, Select0), _mm_or_si128(_mm_and_si128(GreenBytes0, Select1), _mm_and_si128(RedBytes0, Select2))); // Now choose 1 red byte, 1 green byte and 1 blue byte in an alternating fashion.
const __m128i Out1 = _mm_or_si128(_mm_and_si128(BlueBytes0, Select2), _mm_or_si128(_mm_and_si128(GreenBytes0, Select0), _mm_and_si128(RedBytes0, Select1)));
const __m128i Out2 = _mm_or_si128(_mm_and_si128(BlueBytes0, Select1), _mm_or_si128(_mm_and_si128(GreenBytes0, Select2), _mm_and_si128(RedBytes0, Select0)));
const __m128i Out3 = _mm_or_si128(_mm_and_si128(BlueBytes1, Select0), _mm_or_si128(_mm_and_si128(GreenBytes1, Select1), _mm_and_si128(RedBytes1, Select2))); // Now choose 1 red byte, 1 green byte and 1 blue byte in an alternating fashion.
const __m128i Out4 = _mm_or_si128(_mm_and_si128(BlueBytes1, Select2), _mm_or_si128(_mm_and_si128(GreenBytes1, Select0), _mm_and_si128(RedBytes1, Select1)));
const __m128i Out5 = _mm_or_si128(_mm_and_si128(BlueBytes1, Select1), _mm_or_si128(_mm_and_si128(GreenBytes1, Select2), _mm_and_si128(RedBytes1, Select0)));
// Write the data to the target...
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + CntX * 3), Out0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + CntX * 3 + 16), Out1);
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + CntX * 3 + 32), Out2);
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + (CntX + ResX) * 3), Out3);
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + (CntX + ResX) * 3 + 16), Out4);
_mm_storeu_si128(reinterpret_cast<__m128i*>(CurrentTargetBuffer + (CntX + ResX) * 3 + 32), Out5);
}
}
}
Der Code setzt jetzt SSSE3 für den Shuffle voraus, aber das sollte inzwischen eigentlich jeder haben.
Ansonsten habe ich im Wesentlichen die Sachen umgesetzt, die ich vorgeschlagen habe.
Der Code rundet vereinzelt leicht anders, aber in meinem Test ergibt sich eine Abweichung von maximal 1, wobei in den meisten Fällen keine Abweichung besteht.
Falls jemand noch weiter experimentieren will, ich habe meinen vollständigen Testcode (mit Windows Funktionen) angehängt.