diff --git a/src/connection.ts b/src/connection.ts
index fbb0f3221..80119511a 100644
--- a/src/connection.ts
+++ b/src/connection.ts
@@ -239,7 +239,6 @@ export default class Connection {
 
   draw(frame: any) {
     this._draw?.(frame);
-    // globals.I420ToARGB(frame);
   }
 
   close() {
diff --git a/src/globals.js b/src/globals.js
index 3aaf6685c..c9452b349 100644
--- a/src/globals.js
+++ b/src/globals.js
@@ -328,17 +328,18 @@ export function I420ToARGB(yb) {
     vPtr = malloc(n);
   }
   HEAPU8.set(yb.v.bytes, vPtr);
-  const w = yb.format.width;
-  const h = yb.format.height;
+  const w = yb.format.displayWidth;
+  const h = yb.format.displayHeight;
   n = w * h * 4;
   if (outPtrLen != n) {
     if (outPtr) free(outPtr);
    outPtrLen = n;
    outPtr = malloc(n);
+    HEAPU8.fill(255, outPtr, outPtr + n);
   }
   // const res = wasmExports.I420ToARGB(yPtr, yb.y.stride, uPtr, yb.u.stride, vPtr, yb.v.stride, outPtr, w * 4, w, h);
-  const res = wasmExports.AVX_YUV_to_RGBA(outPtr, yPtr, uPtr, vPtr, w, h);
-  // const res = wasmExports.yuv420_rgb24_std(w, h, yPtr, uPtr, vPtr, yb.y.stride, yb.v.stride, outPtr, w * 4, 0);
+  // const res = wasmExports.AVX_YUV_to_ARGB(outPtr, yPtr, yb.y.stride, uPtr, yb.u.stride, vPtr, yb.v.stride, w, h);
+  const res = wasmExports.yuv420_rgb24_std(w, h, yPtr, uPtr, vPtr, yb.y.stride, yb.v.stride, outPtr, w * 4, 1);
   const out = HEAPU8.slice(outPtr, outPtr + n);
   /*
   testSpeed[1] += new Date().getTime() - tm0;
diff --git a/yuv-to-rgb.c b/yuv-to-rgb.c
index 14eb60c78..ed99462e8 100644
--- a/yuv-to-rgb.c
+++ b/yuv-to-rgb.c
@@ -45,7 +45,7 @@ static int foo;
 static int frame;
 
 void
-AVX_YUV_to_RGBA(unsigned char *dst, unsigned char *y, unsigned char* u, unsigned char* v, int width, int height) {
+AVX_YUV_to_ARGB(unsigned char *dst, unsigned char *y, int ystride, unsigned char* u, int ustride, unsigned char* v, int vstride, int width, int height) {
   int r, g, b;
   unsigned char *uline, *vline;
   int w, h;
@@ -54,27 +54,27 @@ AVX_YUV_to_RGBA(unsigned char *dst, unsigned char *y, unsigned
     initialized = !0;
     build_tables();
   }
-  int half_width = width / 2;
 
   // Loop the image, taking into account sub-sample for the chroma channels
   for (h = 0; h < height; h++) {
     uline = u;
     vline = v;
     for (w = 0; w < width; w++, y++) {
       r = *y + T1[*vline];
       g = *y + T2[*vline] + T3[*uline];
       b = *y + T4[*uline];
-      *dst++ = clamp(r); // 16-bit to 8-bit, chuck precision
+      *dst++ = clamp(b); // 16-bit to 8-bit, chuck precision
       *dst++ = clamp(g);
-      *dst++ = clamp(b);
-      *dst++ = 255;
+      *dst++ = clamp(r);
+      ++dst; // alpha byte left as-is; the caller prefills the buffer with 255
       if (w & 0x01) {
         uline++;
         vline++;
       }
     }
+    y += ystride - width; // advance past any luma row padding
     if (h & 0x01) {
-      u += half_width;
-      v += half_width;
+      u += ustride;
+      v += vstride;
     }
   }
 }
diff --git a/yuv.wasm b/yuv.wasm
index f93a6eb2a..f203c685c 100755
Binary files a/yuv.wasm and b/yuv.wasm differ
diff --git a/yuv_rgb.c b/yuv_rgb.c
new file mode 100644
index 000000000..9ac3b4bae
--- /dev/null
+++ b/yuv_rgb.c
@@ -0,0 +1,1305 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+#include "yuv_rgb.h"
+
+//#include
+//
+#include <stdint.h>
+
+#include <x86intrin.h> /* SSE2 intrinsics; the original header name was lost, x86intrin.h is an assumption */
+
+
+uint8_t clamp(int16_t value)
+{
+	return value<0 ? 0 : (value>255 ? 255 : value);
+}
+
+// Definitions
+//
+// E'R, E'G, E'B, E'Y, E'Cb and E'Cr refer to the analog signals
+// E'R, E'G, E'B and E'Y range is [0:1], while E'Cb and E'Cr range is [-0.5:0.5]
+// R, G, B, Y, Cb and Cr refer to the digitalized values
+// The digitalized values can use their full range ([0:255] for 8bit values),
+// or a subrange (typically [16:235] for Y and [16:240] for CbCr).
+// We assume here that RGB range is always [0:255], since it is the case for
+// most digitalized images.
+// For 8bit values :
+// * Y = round((YMax-YMin)*E'Y + YMin)
+// * Cb = round((CbRange)*E'Cb + 128)
+// * Cr = round((CrRange)*E'Cr + 128)
+// Where *Min and *Max are the range of each channel
+//
+// In the analog domain, the RGB to YCbCr transformation is defined as:
+// * E'Y = Rf*E'R + Gf*E'G + Bf*E'B
+// Where Rf, Gf and Bf are constants defined in each standard, with
+// Rf + Gf + Bf = 1 (necessary to ensure that E'Y range is [0:1])
+// * E'Cb = (E'B - E'Y) / CbNorm
+// * E'Cr = (E'R - E'Y) / CrNorm
+// Where CbNorm and CrNorm are constants, dependent on Rf, Gf, Bf, computed
+// to normalize to a [-0.5:0.5] range : CbNorm=2*(1-Bf) and CrNorm=2*(1-Rf)
+//
+// Algorithms
+//
+// Most operations will be made in a fixed point format for speed, using
+// N bits of precision. In the next section the [x] convention is used for
+// a fixed point rounded value, that is (int being the c type conversion)
+// * [x] = int(x*(2^N)+0.5)
+// N can be different for each factor, we simply use the highest value
+// that will not overflow in 16-bit intermediate variables.
+//
+// For RGB to YCbCr conversion, we start by generating a pseudo Y value
+// (noted Y') in fixed point format, using the full range for now.
+// * Y' = ([Rf]*R + [Gf]*G + [Bf]*B)>>N
+// We can then compute Cb and Cr by
+// * Cb = ((B - Y')*[CbRange/(255*CbNorm)])>>N + 128
+// * Cr = ((R - Y')*[CrRange/(255*CrNorm)])>>N + 128
+// And finally, we normalize Y to its digital range
+// * Y = (Y'*[(YMax-YMin)/255])>>N + YMin
+//
+// For YCbCr to RGB conversion, we first compute the full range Y' value :
+// * Y' = ((Y-YMin)*[255/(YMax-YMin)])>>N
+// We can then compute B and R values by :
+// * B = ((Cb-128)*[(255*CbNorm)/CbRange])>>N + Y'
+// * R = ((Cr-128)*[(255*CrNorm)/CrRange])>>N + Y'
+// And finally, for G we know that:
+// * G = (Y' - (Rf*R + Bf*B)) / Gf
+// From above:
+// * G = (Y' - Rf * ((Cr-128)*(255*CrNorm)/CrRange + Y') - Bf * ((Cb-128)*(255*CbNorm)/CbRange + Y')) / Gf
+// Since 1-Rf-Bf=Gf, we can take Y' out of the division by Gf, and we get:
+// * G = Y' - (Cr-128)*Rf/Gf*(255*CrNorm)/CrRange - (Cb-128)*Bf/Gf*(255*CbNorm)/CbRange
+// That we can compute, with fixed point arithmetic, by
+// * G = Y' - ((Cr-128)*[Rf/Gf*(255*CrNorm)/CrRange] + (Cb-128)*[Bf/Gf*(255*CbNorm)/CbRange])>>N
+//
+// Note : in ITU-T T.871(JPEG), Y=Y', so that part could be optimized out
+
+
+#define FIXED_POINT_VALUE(value, precision) ((int)(((value)*(1<<(precision)))+0.5))
+
+// see above for description
+typedef struct
+{
+	uint8_t cb_factor;   // [(255*CbNorm)/CbRange]
+	uint8_t cr_factor;   // [(255*CrNorm)/CrRange]
+	uint8_t g_cb_factor; // [Bf/Gf*(255*CbNorm)/CbRange]
+	uint8_t g_cr_factor; // [Rf/Gf*(255*CrNorm)/CrRange]
+	uint8_t y_factor;    // [255/(YMax-YMin)]
+	uint8_t y_offset;    // YMin
+} YUV2RGBParam;
+
+typedef struct
+{
+	uint8_t r_factor;    // [Rf]
+	uint8_t g_factor;    // [Gf]
+	uint8_t b_factor;    // [Bf]
+	uint8_t cb_factor;   // [CbRange/(255*CbNorm)]
+	uint8_t cr_factor;   // [CrRange/(255*CrNorm)]
+	uint8_t y_factor;    // [(YMax-YMin)/255]
+	uint8_t y_offset;    // YMin
+} RGB2YUVParam;
+
+#define YUV2RGB_PARAM(Rf, Bf, YMin, YMax, CbCrRange) \
+{.cb_factor=FIXED_POINT_VALUE(255.0*(2.0*(1.0-Bf))/CbCrRange, 6), \
+.cr_factor=FIXED_POINT_VALUE(255.0*(2.0*(1.0-Rf))/CbCrRange, 6), \
+.g_cb_factor=FIXED_POINT_VALUE(Bf/(1.0-Bf-Rf)*255.0*(2.0*(1.0-Bf))/CbCrRange, 7), \
+.g_cr_factor=FIXED_POINT_VALUE(Rf/(1.0-Bf-Rf)*255.0*(2.0*(1.0-Rf))/CbCrRange, 7), \
+.y_factor=FIXED_POINT_VALUE(255.0/(YMax-YMin), 7), \
+.y_offset=(YMin)}
+
+static const YUV2RGBParam YUV2RGB[3] = {
+	// ITU-T T.871 (JPEG)
+	YUV2RGB_PARAM(0.299, 0.114, 0.0, 255.0, 255.0),
+	// ITU-R BT.601
+	YUV2RGB_PARAM(0.299, 0.114, 16.0, 235.0, 224.0),
+	// ITU-R BT.709
+	YUV2RGB_PARAM(0.2126, 0.0722, 16.0, 235.0, 224.0)
+};
+
+#define RGB2YUV_PARAM(Rf, Bf, YMin, YMax, CbCrRange) \
+{.r_factor=FIXED_POINT_VALUE(Rf, 8), \
+.g_factor=FIXED_POINT_VALUE(1.0-Rf-Bf, 8), \
+.b_factor=FIXED_POINT_VALUE(Bf, 8), \
+.cb_factor=FIXED_POINT_VALUE(CbCrRange/(255.0*2.0*(1.0-Bf)), 8), \
+.cr_factor=FIXED_POINT_VALUE(CbCrRange/(255.0*2.0*(1.0-Rf)), 8), \
+.y_factor=FIXED_POINT_VALUE((YMax-YMin)/255.0, 7), \
+.y_offset=(YMin)}
+
+static const RGB2YUVParam RGB2YUV[3] = {
+	// ITU-T T.871 (JPEG)
+	RGB2YUV_PARAM(0.299, 0.114, 0.0, 255.0, 255.0),
+	// ITU-R BT.601
+	RGB2YUV_PARAM(0.299, 0.114, 16.0, 235.0, 224.0),
+	// ITU-R BT.709
+	RGB2YUV_PARAM(0.2126, 0.0722, 16.0, 235.0, 224.0)
+};
+
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *RGB, uint32_t RGB_stride,
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+	YCbCrType yuv_type)
+{
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+
+		for(x=0; x<(width-1); x+=2)
+		{
+			// compute yuv for the four pixels, u and v values are summed
+			uint8_t y_tmp;
+			int16_t u_tmp, v_tmp;
+
+			y_tmp = (param->r_factor*rgb_ptr1[0] + param->g_factor*rgb_ptr1[1] + param->b_factor*rgb_ptr1[2])>>8;
+			u_tmp = rgb_ptr1[2]-y_tmp;
+			v_tmp = rgb_ptr1[0]-y_tmp;
+			y_ptr1[0]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr1[3] + param->g_factor*rgb_ptr1[4] + param->b_factor*rgb_ptr1[5])>>8;
+			u_tmp += rgb_ptr1[5]-y_tmp;
+			v_tmp += rgb_ptr1[3]-y_tmp;
+			y_ptr1[1]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr2[0] + param->g_factor*rgb_ptr2[1] + param->b_factor*rgb_ptr2[2])>>8;
+			u_tmp += rgb_ptr2[2]-y_tmp;
+			v_tmp += rgb_ptr2[0]-y_tmp;
+			y_ptr2[0]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr2[3] + param->g_factor*rgb_ptr2[4] + param->b_factor*rgb_ptr2[5])>>8;
+			u_tmp += rgb_ptr2[5]-y_tmp;
+			v_tmp += rgb_ptr2[3]-y_tmp;
+			y_ptr2[1]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			u_ptr[0] = (((u_tmp>>2)*param->cb_factor)>>8) + 128;
+			v_ptr[0] = (((v_tmp>>2)*param->cr_factor)>>8) + 128;
+
+			rgb_ptr1 += 6;
+			rgb_ptr2 += 6;
+			y_ptr1 += 2;
+			y_ptr2 += 2;
+			u_ptr += 1;
+			v_ptr += 1;
+		}
+	}
+}
+
+void rgb32_yuv420_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *RGBA, uint32_t RGBA_stride,
+	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+	YCbCrType yuv_type)
+{
+	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
+
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride,
+			*rgb_ptr2=RGBA+(y+1)*RGBA_stride;
+
+		uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+
+		for(x=0; x<(width-1); x+=2)
+		{
+			// compute yuv for the four pixels, u and v values are summed
+			uint8_t y_tmp;
+			int16_t u_tmp, v_tmp;
+
+			y_tmp = (param->r_factor*rgb_ptr1[0] + param->g_factor*rgb_ptr1[1] + param->b_factor*rgb_ptr1[2])>>8;
+			u_tmp = rgb_ptr1[2]-y_tmp;
+			v_tmp = rgb_ptr1[0]-y_tmp;
+			y_ptr1[0]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr1[4] + param->g_factor*rgb_ptr1[5] + param->b_factor*rgb_ptr1[6])>>8;
+			u_tmp += rgb_ptr1[6]-y_tmp;
+			v_tmp += rgb_ptr1[4]-y_tmp;
+			y_ptr1[1]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr2[0] + param->g_factor*rgb_ptr2[1] + param->b_factor*rgb_ptr2[2])>>8;
+			u_tmp += rgb_ptr2[2]-y_tmp;
+			v_tmp += rgb_ptr2[0]-y_tmp;
+			y_ptr2[0]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			y_tmp = (param->r_factor*rgb_ptr2[4] + param->g_factor*rgb_ptr2[5] + param->b_factor*rgb_ptr2[6])>>8;
+			u_tmp += rgb_ptr2[6]-y_tmp;
+			v_tmp += rgb_ptr2[4]-y_tmp;
+			y_ptr2[1]=((y_tmp*param->y_factor)>>7) + param->y_offset;
+
+			u_ptr[0] = (((u_tmp>>2)*param->cb_factor)>>8) + 128;
+			v_ptr[0] = (((v_tmp>>2)*param->cr_factor)>>8) + 128;
+
+			rgb_ptr1 += 8;
+			rgb_ptr2 += 8;
+			y_ptr1 += 2;
+			y_ptr2 += 2;
+			u_ptr += 1;
+			v_ptr += 1;
+		}
+	}
+}
+
+
+void yuv420_rgb24_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
+	uint8_t *RGB, uint32_t RGB_stride,
+	YCbCrType yuv_type)
+{
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*u_ptr=U+(y/2)*UV_stride,
+			*v_ptr=V+(y/2)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+
+		for(x=0; x<(width-1); x+=2)
+		{
+			int8_t u_tmp, v_tmp;
+			u_tmp = u_ptr[0]-128;
+			v_tmp = v_ptr[0]-128;
+
+			//compute Cb Cr color offsets, common to four pixels
+			int16_t b_cb_offset, r_cr_offset, g_cbcr_offset;
+			b_cb_offset = (param->cb_factor*u_tmp)>>6;
+			r_cr_offset = (param->cr_factor*v_tmp)>>6;
+			g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7;
+
+			// BGRX output: blue in byte 0, red in byte 2, byte 3 (alpha) untouched
+			int16_t y_tmp;
+			y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7;
+			rgb_ptr1[2] = clamp(y_tmp + r_cr_offset);
+			rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset);
+			rgb_ptr1[0] = clamp(y_tmp + b_cb_offset);
+
+			y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7;
+			rgb_ptr1[6] = clamp(y_tmp + r_cr_offset);
+			rgb_ptr1[5] = clamp(y_tmp - g_cbcr_offset);
+			rgb_ptr1[4] = clamp(y_tmp + b_cb_offset);
+
+			y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7;
+			rgb_ptr2[2] = clamp(y_tmp + r_cr_offset);
+			rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset);
+			rgb_ptr2[0] = clamp(y_tmp + b_cb_offset);
+
+			y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7;
+			rgb_ptr2[6] = clamp(y_tmp + r_cr_offset);
+			rgb_ptr2[5] = clamp(y_tmp - g_cbcr_offset);
+			rgb_ptr2[4] =
clamp(y_tmp + b_cb_offset); + + rgb_ptr1 += 8; + rgb_ptr2 += 8; + y_ptr1 += 2; + y_ptr2 += 2; + u_ptr += 1; + v_ptr += 1; + } + } +} + +void nv12_rgb24_std( + uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *uv_ptr=UV+(y/2)*UV_stride; + + uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + for(x=0; x<(width-1); x+=2) + { + int8_t u_tmp, v_tmp; + u_tmp = uv_ptr[0]-128; + v_tmp = uv_ptr[1]-128; + + //compute Cb Cr color offsets, common to four pixels + int16_t b_cb_offset, r_cr_offset, g_cbcr_offset; + b_cb_offset = (param->cb_factor*u_tmp)>>6; + r_cr_offset = (param->cr_factor*v_tmp)>>6; + g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7; + + int16_t y_tmp; + y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7; + rgb_ptr1[0] = clamp(y_tmp + r_cr_offset); + rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr1[2] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7; + rgb_ptr1[3] = clamp(y_tmp + r_cr_offset); + rgb_ptr1[4] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr1[5] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7; + rgb_ptr2[0] = clamp(y_tmp + r_cr_offset); + rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr2[2] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7; + rgb_ptr2[3] = clamp(y_tmp + r_cr_offset); + rgb_ptr2[4] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr2[5] = clamp(y_tmp + b_cb_offset); + + rgb_ptr1 += 6; + rgb_ptr2 += 6; + y_ptr1 += 2; + y_ptr2 += 2; + uv_ptr += 2; + } + } +} + +void nv21_rgb24_std( + uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *uv_ptr=UV+(y/2)*UV_stride; + + uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + for(x=0; x<(width-1); x+=2) + { + int8_t u_tmp, v_tmp; + u_tmp = uv_ptr[1]-128; + v_tmp = uv_ptr[0]-128; + + //compute Cb Cr color offsets, common to four pixels + int16_t b_cb_offset, r_cr_offset, g_cbcr_offset; + b_cb_offset = (param->cb_factor*u_tmp)>>6; + r_cr_offset = (param->cr_factor*v_tmp)>>6; + g_cbcr_offset = (param->g_cb_factor*u_tmp + param->g_cr_factor*v_tmp)>>7; + + int16_t y_tmp; + y_tmp = (param->y_factor*(y_ptr1[0]-param->y_offset))>>7; + rgb_ptr1[0] = clamp(y_tmp + r_cr_offset); + rgb_ptr1[1] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr1[2] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr1[1]-param->y_offset))>>7; + rgb_ptr1[3] = clamp(y_tmp + r_cr_offset); + rgb_ptr1[4] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr1[5] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr2[0]-param->y_offset))>>7; + rgb_ptr2[0] = clamp(y_tmp + r_cr_offset); + rgb_ptr2[1] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr2[2] = clamp(y_tmp + b_cb_offset); + + y_tmp = (param->y_factor*(y_ptr2[1]-param->y_offset))>>7; + rgb_ptr2[3] = clamp(y_tmp + r_cr_offset); + rgb_ptr2[4] = clamp(y_tmp - g_cbcr_offset); + rgb_ptr2[5] = clamp(y_tmp + b_cb_offset); + + 
rgb_ptr1 += 6; + rgb_ptr2 += 6; + y_ptr1 += 2; + y_ptr2 += 2; + uv_ptr += 2; + } + } +} + + +#ifdef __SSE2__ + +//see rgb.txt +#define UNPACK_RGB24_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RD1, RD2, RD3, RD4, RD5, RD6) \ +RD1 = _mm_unpacklo_epi8(RS1, RS4); \ +RD2 = _mm_unpackhi_epi8(RS1, RS4); \ +RD3 = _mm_unpacklo_epi8(RS2, RS5); \ +RD4 = _mm_unpackhi_epi8(RS2, RS5); \ +RD5 = _mm_unpacklo_epi8(RS3, RS6); \ +RD6 = _mm_unpackhi_epi8(RS3, RS6); + +#define RGB2YUV_16(R, G, B, Y, U, V) \ +Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(G, _mm_set1_epi16(param->g_factor))); \ +Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->b_factor))); \ +Y = _mm_srli_epi16(Y, 8); \ +U = _mm_mullo_epi16(_mm_sub_epi16(B, Y), _mm_set1_epi16(param->cb_factor)); \ +U = _mm_add_epi16(_mm_srai_epi16(U, 8), _mm_set1_epi16(128)); \ +V = _mm_mullo_epi16(_mm_sub_epi16(R, Y), _mm_set1_epi16(param->cr_factor)); \ +V = _mm_add_epi16(_mm_srai_epi16(V, 8), _mm_set1_epi16(128)); \ +Y = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(Y, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); + +#define RGB2YUV_32 \ + __m128i r_16, g_16, b_16; \ + __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \ + __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; \ + __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \ + rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \ + rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \ + rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \ + rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \ + rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \ + /* unpack rgb24 data to r, g and b data in separate channels*/ \ + /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ + /* here, because averaging in horizontal direction is easier like this*/ \ + /* The last step is applied further on the Y channel only*/ \ + UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ + UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ + UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ + UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ + /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ + r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb1_16 = _mm_sub_epi16(b_16, y1_16); \ + cr1_16 = _mm_sub_epi16(r_16, y1_16); \ + r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, 
y2_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr1), Y); \ + /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ + r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \ + r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr2), Y); \ + /* Rescale Cb and Cr to their final range */ \ + cb1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ + cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ + \ + /* do the same again with next data */ \ + rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \ + rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \ + rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \ + rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)), \ + rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \ + rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \ + /* unpack rgb24 data to r, g and b data in separate channels*/ \ + /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ + /* here, because averaging in horizontal direction is easier like this*/ \ + /* The last step is applied further on the Y channel only*/ \ + UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ + UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, 
rgb3, rgb4, rgb5, rgb6) \ + UNPACK_RGB24_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6) \ + UNPACK_RGB24_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6) \ + /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ + r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb2_16 = _mm_sub_epi16(b_16, y1_16); \ + cr2_16 = _mm_sub_epi16(r_16, y1_16); \ + r_16 = _mm_unpacklo_epi8(rgb4, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr1+16), Y); \ + /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ + r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \ + r_16 = _mm_unpackhi_epi8(rgb4, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = 
_mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr2+16), Y); \ + /* Rescale Cb and Cr to their final range */ \ + cb2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ + cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ + /* Pack and save Cb Cr */ \ + cb = _mm_packus_epi16(cb1_16, cb2_16); \ + cr = _mm_packus_epi16(cr1_16, cr2_16); \ + SAVE_SI128((__m128i*)(u_ptr), cb); \ + SAVE_SI128((__m128i*)(v_ptr), cr); + + +void rgb24_yuv420_sse(uint32_t width, uint32_t height, + const uint8_t *RGB, uint32_t RGB_stride, + uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_load_si128 + #define SAVE_SI128 _mm_stream_si128 + const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + for(x=0; x<(width-31); x+=32) + { + RGB2YUV_32 + + rgb_ptr1+=96; + rgb_ptr2+=96; + y_ptr1+=32; + y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + +void rgb24_yuv420_sseu(uint32_t width, uint32_t height, + const uint8_t *RGB, uint32_t RGB_stride, + uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_loadu_si128 + #define SAVE_SI128 _mm_storeu_si128 + const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + for(x=0; x<(width-31); x+=32) + { + RGB2YUV_32 + + rgb_ptr1+=96; + rgb_ptr2+=96; + y_ptr1+=32; + y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + + +// see rgba.txt +#define UNPACK_RGB32_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RS7, RS8, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8) \ +RD1 = _mm_unpacklo_epi8(RS1, RS5); \ +RD2 = _mm_unpackhi_epi8(RS1, RS5); \ +RD3 = _mm_unpacklo_epi8(RS2, RS6); \ +RD4 = _mm_unpackhi_epi8(RS2, RS6); \ +RD5 = _mm_unpacklo_epi8(RS3, RS7); \ +RD6 = _mm_unpackhi_epi8(RS3, RS7); \ +RD7 = _mm_unpacklo_epi8(RS4, RS8); \ +RD8 = _mm_unpackhi_epi8(RS4, RS8); + + +#define RGBA2YUV_32 \ + __m128i r_16, g_16, b_16; \ + __m128i y1_16, y2_16, cb1_16, cb2_16, cr1_16, cr2_16, Y, cb, cr; \ + __m128i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; \ + __m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \ + rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \ + rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \ + rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)), \ + rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \ + rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \ + rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)), \ + rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \ + /* unpack rgb24 data to r, g and b data in separate channels*/ \ + /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to 
last step*/ \ + /* here, because averaging in horizontal direction is easier like this*/ \ + /* The last step is applied further on the Y channel only*/ \ + UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ + UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ + UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ + UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ + /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ + r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb1_16 = _mm_sub_epi16(b_16, y1_16); \ + cr1_16 = _mm_sub_epi16(r_16, y1_16); \ + r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr1), Y); \ + /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ + r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y1_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y1_16)); \ + r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, 
_mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb1_16 = _mm_add_epi16(cb1_16, _mm_sub_epi16(b_16, y2_16)); \ + cr1_16 = _mm_add_epi16(cr1_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr2), Y); \ + /* Rescale Cb and Cr to their final range */ \ + cb1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb1_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ + cr1_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr1_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ + \ + /* do the same again with next data */ \ + rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)), \ + rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)), \ + rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+96)), \ + rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr1+112)), \ + rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)), \ + rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)), \ + rgb7 = LOAD_SI128((const __m128i*)(rgb_ptr2+96)), \ + rgb8 = LOAD_SI128((const __m128i*)(rgb_ptr2+112)); \ + /* unpack rgb24 data to r, g and b data in separate channels*/ \ + /* see rgb.txt to get an idea of the algorithm, note that we only go to the next to last step*/ \ + /* here, because averaging in horizontal direction is easier like this*/ \ + /* The last step is applied further on the Y channel only*/ \ + UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ + UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ + UNPACK_RGB32_32_STEP(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8) \ + UNPACK_RGB32_32_STEP(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, rgb7, rgb8) \ + /* first compute Y', (B-Y') and (R-Y'), in 16bits values, for the first line */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are saved*/ \ + r_16 = _mm_unpacklo_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb2_16 = _mm_sub_epi16(b_16, y1_16); \ + cr2_16 = _mm_sub_epi16(r_16, y1_16); \ + r_16 = _mm_unpacklo_epi8(rgb5, _mm_setzero_si128()); \ + g_16 = _mm_unpacklo_epi8(rgb6, _mm_setzero_si128()); \ + b_16 = _mm_unpacklo_epi8(rgb7, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, 
_mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr1+16), Y); \ + /* same for the second line, compute Y', (B-Y') and (R-Y'), in 16bits values */ \ + /* Y is saved for each pixel, while only sums of (B-Y') and (R-Y') for pairs of adjacents pixels are added to the previous values*/ \ + r_16 = _mm_unpackhi_epi8(rgb1, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb2, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb3, _mm_setzero_si128()); \ + y1_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y1_16 = _mm_add_epi16(y1_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y1_16 = _mm_srli_epi16(y1_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y1_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y1_16)); \ + r_16 = _mm_unpackhi_epi8(rgb5, _mm_setzero_si128()); \ + g_16 = _mm_unpackhi_epi8(rgb6, _mm_setzero_si128()); \ + b_16 = _mm_unpackhi_epi8(rgb7, _mm_setzero_si128()); \ + y2_16 = _mm_add_epi16(_mm_mullo_epi16(r_16, _mm_set1_epi16(param->r_factor)), \ + _mm_mullo_epi16(g_16, _mm_set1_epi16(param->g_factor))); \ + y2_16 = _mm_add_epi16(y2_16, _mm_mullo_epi16(b_16, _mm_set1_epi16(param->b_factor))); \ + y2_16 = _mm_srli_epi16(y2_16, 8); \ + cb2_16 = _mm_add_epi16(cb2_16, _mm_sub_epi16(b_16, y2_16)); \ + cr2_16 = _mm_add_epi16(cr2_16, _mm_sub_epi16(r_16, y2_16)); \ + /* Rescale Y' to Y, pack it to 8bit values and save it */ \ + y1_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y1_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + y2_16 = _mm_add_epi16(_mm_srli_epi16(_mm_mullo_epi16(y2_16, _mm_set1_epi16(param->y_factor)), 7), _mm_set1_epi16(param->y_offset)); \ + Y = _mm_packus_epi16(y1_16, y2_16); \ + Y = _mm_unpackhi_epi8(_mm_slli_si128(Y, 8), Y); \ + SAVE_SI128((__m128i*)(y_ptr2+16), Y); \ + /* Rescale Cb and Cr to their final range */ \ + cb2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cb2_16, 2), _mm_set1_epi16(param->cb_factor)), 8), _mm_set1_epi16(128)); \ + cr2_16 = _mm_add_epi16(_mm_srai_epi16(_mm_mullo_epi16(_mm_srai_epi16(cr2_16, 2), _mm_set1_epi16(param->cr_factor)), 8), _mm_set1_epi16(128)); \ + /* Pack and save Cb Cr */ \ + cb = _mm_packus_epi16(cb1_16, cb2_16); \ + cr = _mm_packus_epi16(cr1_16, cr2_16); \ + SAVE_SI128((__m128i*)(u_ptr), cb); \ + SAVE_SI128((__m128i*)(v_ptr), cr); + +void rgb32_yuv420_sse(uint32_t width, uint32_t height, + const uint8_t *RGBA, uint32_t RGBA_stride, + uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_load_si128 + #define SAVE_SI128 _mm_stream_si128 + const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride, + *rgb_ptr2=RGBA+(y+1)*RGBA_stride; + + uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + for(x=0; x<(width-31); x+=32) + { + RGBA2YUV_32 + + rgb_ptr1+=128; + rgb_ptr2+=128; + y_ptr1+=32; + 
y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + +void rgb32_yuv420_sseu(uint32_t width, uint32_t height, + const uint8_t *RGBA, uint32_t RGBA_stride, + uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_loadu_si128 + #define SAVE_SI128 _mm_storeu_si128 + const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *rgb_ptr1=RGBA+y*RGBA_stride, + *rgb_ptr2=RGBA+(y+1)*RGBA_stride; + + uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + for(x=0; x<(width-31); x+=32) + { + RGBA2YUV_32 + + rgb_ptr1+=128; + rgb_ptr2+=128; + y_ptr1+=32; + y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + +#endif + +#ifdef __SSE2__ + +#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \ + r_tmp = _mm_srai_epi16(_mm_mullo_epi16(V, _mm_set1_epi16(param->cr_factor)), 6); \ + g_tmp = _mm_srai_epi16(_mm_add_epi16( \ + _mm_mullo_epi16(U, _mm_set1_epi16(param->g_cb_factor)), \ + _mm_mullo_epi16(V, _mm_set1_epi16(param->g_cr_factor))), 7); \ + b_tmp = _mm_srai_epi16(_mm_mullo_epi16(U, _mm_set1_epi16(param->cb_factor)), 6); \ + R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \ + G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \ + B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \ + R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \ + G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \ + B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \ + +#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \ + Y1 = _mm_srai_epi16(_mm_mullo_epi16(Y1, _mm_set1_epi16(param->y_factor)), 7); \ + Y2 = _mm_srai_epi16(_mm_mullo_epi16(Y2, _mm_set1_epi16(param->y_factor)), 7); \ + \ + R1 = _mm_add_epi16(Y1, R1); \ + G1 = _mm_sub_epi16(Y1, G1); \ + B1 = _mm_add_epi16(Y1, B1); \ + R2 = _mm_add_epi16(Y2, R2); \ + G2 = _mm_sub_epi16(Y2, G2); \ + B2 = _mm_add_epi16(Y2, B2); \ + +#define PACK_RGB24_32_STEP(RS1, RS2, RS3, RS4, RS5, RS6, RD1, RD2, RD3, RD4, RD5, RD6) \ +RD1 = _mm_packus_epi16(_mm_and_si128(RS1,_mm_set1_epi16(0xFF)), _mm_and_si128(RS2,_mm_set1_epi16(0xFF))); \ +RD2 = _mm_packus_epi16(_mm_and_si128(RS3,_mm_set1_epi16(0xFF)), _mm_and_si128(RS4,_mm_set1_epi16(0xFF))); \ +RD3 = _mm_packus_epi16(_mm_and_si128(RS5,_mm_set1_epi16(0xFF)), _mm_and_si128(RS6,_mm_set1_epi16(0xFF))); \ +RD4 = _mm_packus_epi16(_mm_srli_epi16(RS1,8), _mm_srli_epi16(RS2,8)); \ +RD5 = _mm_packus_epi16(_mm_srli_epi16(RS3,8), _mm_srli_epi16(RS4,8)); \ +RD6 = _mm_packus_epi16(_mm_srli_epi16(RS5,8), _mm_srli_epi16(RS6,8)); \ + +#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ +PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \ +PACK_RGB24_32_STEP(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ + +#define LOAD_UV_PLANAR \ + __m128i u = LOAD_SI128((const __m128i*)(u_ptr)); \ + __m128i v = LOAD_SI128((const __m128i*)(v_ptr)); \ + +#define LOAD_UV_NV12 \ + __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \ + __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \ + __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ + uv1 = _mm_srli_epi16(uv1, 8); \ + uv2 = _mm_srli_epi16(uv2, 8); \ + 
__m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ + +#define LOAD_UV_NV21 \ + __m128i uv1 = LOAD_SI128((const __m128i*)(uv_ptr)); \ + __m128i uv2 = LOAD_SI128((const __m128i*)(uv_ptr+16)); \ + __m128i v = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ + uv1 = _mm_srli_epi16(uv1, 8); \ + uv2 = _mm_srli_epi16(uv2, 8); \ + __m128i u = _mm_packus_epi16(_mm_and_si128(uv1, _mm_set1_epi16(255)), _mm_and_si128(uv2, _mm_set1_epi16(255))); \ + +#define YUV2RGB_32 \ + __m128i r_tmp, g_tmp, b_tmp; \ + __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \ + __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \ + __m128i y_16_1, y_16_2; \ + \ + u = _mm_add_epi8(u, _mm_set1_epi8(-128)); \ + v = _mm_add_epi8(v, _mm_set1_epi8(-128)); \ + \ + /* process first 16 pixels of first line */\ + __m128i u_16 = _mm_srai_epi16(_mm_unpacklo_epi8(u, u), 8); \ + __m128i v_16 = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8); \ + \ + UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + __m128i y = LOAD_SI128((const __m128i*)(y_ptr1)); \ + y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + __m128i r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \ + __m128i g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \ + __m128i b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process first 16 pixels of second line */\ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + y = LOAD_SI128((const __m128i*)(y_ptr2)); \ + y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + __m128i r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \ + __m128i g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \ + __m128i b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process last 16 pixels of first line */\ + u_16 = _mm_srai_epi16(_mm_unpackhi_epi8(u, u), 8); \ + v_16 = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8); \ + \ + UV2RGB_16(u_16, v_16, r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2) \ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + y = LOAD_SI128((const __m128i*)(y_ptr1+16)); \ + y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + __m128i r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \ + __m128i g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \ + __m128i b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process last 16 pixels of second line */\ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + y = LOAD_SI128((const __m128i*)(y_ptr2+16)); \ + y = _mm_sub_epi8(y, _mm_set1_epi8(param->y_offset)); \ + y_16_1 = _mm_unpacklo_epi8(y, 
_mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + __m128i r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \ + __m128i g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \ + __m128i b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \ + \ + PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ + SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ + SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ + SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ + SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ + SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ + SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ + \ + PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ + SAVE_SI128((__m128i*)(rgb_ptr2), rgb_1); \ + SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_2); \ + SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_3); \ + SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_4); \ + SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_5); \ + SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_6); \ + +#define YUV2RGB_32_PLANAR \ + LOAD_UV_PLANAR \ + YUV2RGB_32 + +#define YUV2RGB_32_NV12 \ + LOAD_UV_NV12 \ + YUV2RGB_32 + +#define YUV2RGB_32_NV21 \ + LOAD_UV_NV21 \ + YUV2RGB_32 + + +void yuv420_rgb24_sse( + uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_load_si128 + #define SAVE_SI128 _mm_stream_si128 + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + for(x=0; x<(width-31); x+=32) + { + YUV2RGB_32_PLANAR + + y_ptr1+=32; + y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + rgb_ptr1+=96; + rgb_ptr2+=96; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + +void yuv420_rgb24_sseu( + uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_loadu_si128 + #define SAVE_SI128 _mm_storeu_si128 + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *u_ptr=U+(y/2)*UV_stride, + *v_ptr=V+(y/2)*UV_stride; + + uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + for(x=0; x<(width-31); x+=32) + { + YUV2RGB_32_PLANAR + + y_ptr1+=32; + y_ptr2+=32; + u_ptr+=16; + v_ptr+=16; + rgb_ptr1+=96; + rgb_ptr2+=96; + } + } + #undef LOAD_SI128 + #undef SAVE_SI128 +} + +void nv12_rgb24_sse( + uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + #define LOAD_SI128 _mm_load_si128 + #define SAVE_SI128 _mm_stream_si128 + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); + + uint32_t x, y; + for(y=0; y<(height-1); y+=2) + { + const uint8_t *y_ptr1=Y+y*Y_stride, + *y_ptr2=Y+(y+1)*Y_stride, + *uv_ptr=UV+(y/2)*UV_stride; + + uint8_t *rgb_ptr1=RGB+y*RGB_stride, + *rgb_ptr2=RGB+(y+1)*RGB_stride; + + for(x=0; x<(width-31); x+=32) 
+		{
+			YUV2RGB_32_NV12
+
+			y_ptr1+=32;
+			y_ptr2+=32;
+			uv_ptr+=32;
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+void nv12_rgb24_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
+	uint8_t *RGB, uint32_t RGB_stride,
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_loadu_si128
+	#define SAVE_SI128 _mm_storeu_si128
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*uv_ptr=UV+(y/2)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+
+		for(x=0; x<(width-31); x+=32)
+		{
+			YUV2RGB_32_NV12
+
+			y_ptr1+=32;
+			y_ptr2+=32;
+			uv_ptr+=32;
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+void nv21_rgb24_sse(
+	uint32_t width, uint32_t height,
+	const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
+	uint8_t *RGB, uint32_t RGB_stride,
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_load_si128
+	#define SAVE_SI128 _mm_stream_si128
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*uv_ptr=UV+(y/2)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+
+		for(x=0; x<(width-31); x+=32)
+		{
+			YUV2RGB_32_NV21
+
+			y_ptr1+=32;
+			y_ptr2+=32;
+			uv_ptr+=32;
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+void nv21_rgb24_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *Y, const uint8_t *UV, uint32_t Y_stride, uint32_t UV_stride,
+	uint8_t *RGB, uint32_t RGB_stride,
+	YCbCrType yuv_type)
+{
+	#define LOAD_SI128 _mm_loadu_si128
+	#define SAVE_SI128 _mm_storeu_si128
+	const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
+
+	uint32_t x, y;
+	for(y=0; y<(height-1); y+=2)
+	{
+		const uint8_t *y_ptr1=Y+y*Y_stride,
+			*y_ptr2=Y+(y+1)*Y_stride,
+			*uv_ptr=UV+(y/2)*UV_stride;
+
+		uint8_t *rgb_ptr1=RGB+y*RGB_stride,
+			*rgb_ptr2=RGB+(y+1)*RGB_stride;
+
+		for(x=0; x<(width-31); x+=32)
+		{
+			YUV2RGB_32_NV21
+
+			y_ptr1+=32;
+			y_ptr2+=32;
+			uv_ptr+=32;
+			rgb_ptr1+=96;
+			rgb_ptr2+=96;
+		}
+	}
+	#undef LOAD_SI128
+	#undef SAVE_SI128
+}
+
+
+
+#endif //__SSE2__
diff --git a/yuv_rgb.h b/yuv_rgb.h
new file mode 100644
index 000000000..9f431a6bf
--- /dev/null
+++ b/yuv_rgb.h
@@ -0,0 +1,155 @@
+// Copyright 2016 Adrien Descamps
+// Distributed under BSD 3-Clause License
+
+// Provide optimized functions to convert images from 8bits yuv420 to rgb24 format
+
+// There are a few slightly different variations of the YCbCr color space with different parameters that
+// change the conversion matrix.
+// The three most common YCbCr color spaces, defined by the BT.601, BT.709 and JPEG standards, are implemented here.
+// See the respective standards for details
+// The matrix values used are derived from http://www.equasys.de/colorconversion.html
+
+// YUV420 is stored as three separate channels, with U and V (Cb and Cr) subsampled by a factor of 2
+// For conversion from yuv to rgb, no interpolation is done, and the same UV values are used for 4 rgb pixels. This
+// is suboptimal for image quality, but by far the fastest method.
+
+// For all methods, width and height should be even; if not, the last row/column of the result image won't be affected.
+// For sse methods, if the width is not divisible by 32, the last (width%32) pixels of each line won't be affected.
+
+#include <stdint.h>
+
+typedef enum
+{
+	YCBCR_JPEG,
+	YCBCR_601,
+	YCBCR_709
+} YCbCrType;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// yuv to rgb, standard c implementation
+void yuv420_rgb24_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv to rgb, yuv in nv12 semi planar format
+void nv12_rgb24_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv to rgb, yuv in nv21 semi planar format
+void nv21_rgb24_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void yuv420_rgb24_sse(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+void yuv420_rgb24_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *u, const uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv nv12 to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void nv12_rgb24_sse(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv nv12 to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+void nv12_rgb24_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv nv21 to rgb, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void nv21_rgb24_sse(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+// yuv nv21 to rgb, sse implementation
+// pointers do not need to be 16 byte aligned
+void nv21_rgb24_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *y, const uint8_t *uv, uint32_t y_stride, uint32_t uv_stride,
+	uint8_t *rgb, uint32_t rgb_stride,
+	YCbCrType yuv_type);
+
+
+
+
+// rgb to yuv, standard c implementation
+void rgb24_yuv420_std(
+	uint32_t width, uint32_t height,
+	const uint8_t *rgb, uint32_t rgb_stride,
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers must be 16 byte aligned, and strides must be divisible by 16
+void rgb24_yuv420_sse(
+	uint32_t width, uint32_t height,
+	const uint8_t *rgb, uint32_t rgb_stride,
+	uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride,
+	YCbCrType yuv_type);
+
+// rgb to yuv, sse implementation
+// pointers do not need to be 16 byte aligned
+void rgb24_yuv420_sseu(
+	uint32_t width, uint32_t height,
+	const uint8_t *rgb, uint32_t
rgb_stride, + uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, + YCbCrType yuv_type); + +// rgba to yuv, standard c implementation +// alpha channel is ignored +void rgb32_yuv420_std( + uint32_t width, uint32_t height, + const uint8_t *rgba, uint32_t rgba_stride, + uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, + YCbCrType yuv_type); + +// rgba to yuv, sse implementation +// pointers must be 16 byte aligned, and strides must be divisible by 16 +// alpha channel is ignored +void rgb32_yuv420_sse( + uint32_t width, uint32_t height, + const uint8_t *rgba, uint32_t rgba_stride, + uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, + YCbCrType yuv_type); + +// rgba to yuv, sse implementation +// pointers do not need to be 16 byte aligned +// alpha channel is ignored +void rgb32_yuv420_sseu( + uint32_t width, uint32_t height, + const uint8_t *rgba, uint32_t rgba_stride, + uint8_t *y, uint8_t *u, uint8_t *v, uint32_t y_stride, uint32_t uv_stride, + YCbCrType yuv_type); + +#ifdef __cplusplus +} +#endif
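
Note on the JS/wasm contract: globals.js now calls the vendored yuv420_rgb24_std with RGB_stride = w * 4 and yuv_type 1 (YCBCR_601 in the enum), and prefills the output buffer with 255 once per allocation. That only works because this copy of yuv420_rgb24_std writes four bytes per pixel in B, G, R order and never touches the fourth byte, so the prefilled 255s survive as the alpha channel. Below is a minimal native harness replaying that contract; it is a sketch of my own (the 4x2 frame size and the flat grey test values are invented for the demo), not part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "yuv_rgb.h"

int main(void) {
	const uint32_t w = 4, h = 2;
	uint8_t y[4 * 2], u[2 * 1], v[2 * 1];
	memset(y, 128, sizeof y);  /* flat mid-grey luma */
	memset(u, 128, sizeof u);  /* neutral chroma */
	memset(v, 128, sizeof v);

	uint8_t *out = malloc(w * h * 4);
	memset(out, 255, w * h * 4);  /* prefill the alpha bytes, as globals.js does */

	/* Y_stride = w, UV_stride = w/2, RGB_stride = w*4, YCBCR_601 == 1 */
	yuv420_rgb24_std(w, h, y, u, v, w, w / 2, out, w * 4, YCBCR_601);

	/* each pixel is four bytes: B, G, R, then the untouched 255 alpha */
	printf("pixel0 = B:%d G:%d R:%d A:%d\n", out[0], out[1], out[2], out[3]);
	free(out);
	return 0;
}

With BT.601 limited-range scaling, the flat Y=128, U=V=128 frame should print B, G and R around 130 and A=255; anything but 255 in A would mean the converter has started touching the fourth byte and the prefill trick no longer holds.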
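For reference, the fixed-point factors derived in the long comment at the top of yuv_rgb.c can be checked by hand. Assuming the parameter construction shown in yuv_rgb.c above (FIXED_POINT_VALUE applied to the BT.601 constants Rf=0.299, Bf=0.114, Y in [16:235], CbCr range 224), the BT.601 YUV2RGB entries come out as below, and the chosen precisions line up with the shifts used in yuv420_rgb24_std, namely (cb_factor*u_tmp)>>6 and (y_factor*(Y-y_offset))>>7.

#include <stdio.h>

#define FIXED_POINT_VALUE(value, precision) ((int)(((value)*(1<<(precision)))+0.5))

int main(void) {
	const double Rf = 0.299, Bf = 0.114;  /* BT.601 luma weights */
	const double CbCrRange = 224.0;       /* [16:240] spans 224 steps */
	const double YMin = 16.0, YMax = 235.0;
	/* [(255*CbNorm)/CbRange] at N=6: 255*2*(1-Bf)/224 = 2.017 -> 129 */
	printf("cb_factor = %d\n", FIXED_POINT_VALUE(255.0*2.0*(1.0-Bf)/CbCrRange, 6));
	/* [(255*CrNorm)/CrRange] at N=6: 255*2*(1-Rf)/224 = 1.596 -> 102 */
	printf("cr_factor = %d\n", FIXED_POINT_VALUE(255.0*2.0*(1.0-Rf)/CbCrRange, 6));
	/* [255/(YMax-YMin)] at N=7: 255/219 = 1.164 -> 149 */
	printf("y_factor  = %d\n", FIXED_POINT_VALUE(255.0/(YMax-YMin), 7));
	return 0;
}

So for BT.601, B = Y' + ((129*(Cb-128))>>6) and R = Y' + ((102*(Cr-128))>>6), which is exactly the arithmetic the std path (and, with the same constants broadcast into vectors, the SSE path) performs.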