diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ccae921f1..4da180c58 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,7 +38,7 @@ jobs: - name: Get MacOS dev libraries and tools if: matrix.os == 'macos-latest' run: | - brew install pkg-config libvpx x264 opus sdl2 + brew install pkg-config libvpx x264 opus sdl2 jpeg-turbo - name: Get Windows dev libraries and tools if: matrix.os == 'windows-latest' diff --git a/Makefile b/Makefile index f0afe6ad7..748aa595f 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ test: go test -v ./pkg/... verify-cores: - go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "../../../_rendered" + go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "./_rendered" dev.build: compile build diff --git a/README.md b/README.md index b3f181c31..d1d837ad7 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ a better sense of performance. apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev # MacOS -brew install pkg-config libvpx x264 opus sdl2 +brew install pkg-config libvpx x264 opus sdl2 jpeg-turbo # Windows (MSYS2) pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2,libyuv} diff --git a/pkg/encoder/yuv/libyuv/LICENSE b/pkg/encoder/yuv/libyuv/LICENSE deleted file mode 100644 index c911747a6..000000000 --- a/pkg/encoder/yuv/libyuv/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -Copyright 2011 The LibYuv Project Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/encoder/yuv/libyuv/basic_types.h b/pkg/encoder/yuv/libyuv/basic_types.h deleted file mode 100644 index 9c66a132a..000000000 --- a/pkg/encoder/yuv/libyuv/basic_types.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ -#define INCLUDE_LIBYUV_BASIC_TYPES_H_ - -#include // For size_t and NULL - -#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) -#define INT_TYPES_DEFINED - -#include // for uintptr_t and C99 types - -#endif // INT_TYPES_DEFINED - -#if !defined(LIBYUV_API) -#define LIBYUV_API -#endif // LIBYUV_API - -#define LIBYUV_BOOL int - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/pkg/encoder/yuv/libyuv/convert.c b/pkg/encoder/yuv/libyuv/convert.c deleted file mode 100644 index c59da3b1b..000000000 --- a/pkg/encoder/yuv/libyuv/convert.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "convert.h" - -#include "basic_types.h" -#include "cpu_id.h" -#include "planar_functions.h" -#include "row.h" - -// Subsample amount uses a shift. -// v is value -// a is amount to add to round up -// s is shift to subsample down -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// Copy I420 with optional flipping. -// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure -// is does row coalescing. -LIBYUV_API -int I420Copy(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// Convert ARGB to I420. -LIBYUV_API -int ARGBToI420(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - } - return 0; -} - -// Convert ABGR to I420. -LIBYUV_API -int ABGRToI420(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ABGRToUVRow)(const uint8_t *src_abgr0, int src_stride_abgr, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t *src_abgr, uint8_t *dst_y, int width) = - ABGRToYRow_C; - if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - } - return 0; -} - -// Convert RGB565 to I420. 
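The ARGB and ABGR converters above walk the source two rows at a time because the U and V planes are subsampled 2x2, and chroma dimensions round up with the same (v + 1) >> 1 convention used by SUBSAMPLE and halfwidth/halfheight. A minimal Go sketch (illustrative only, not part of this change) of how an I420 destination buffer for these functions would be sized:

package main

import "fmt"

// i420Size returns the Y-plane and per-chroma-plane sizes for a w x h frame,
// rounding chroma up the same way I420Copy computes halfwidth/halfheight.
func i420Size(w, h int) (ySize, uvSize int) {
	cw, ch := (w+1)/2, (h+1)/2 // chroma planes are subsampled 2x2, rounded up
	return w * h, cw * ch
}

func main() {
	y, uv := i420Size(641, 481)   // odd sizes still get full chroma coverage
	fmt.Println(y + 2*uv)         // total bytes for one I420 frame (Y + U + V)
}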
-LIBYUV_API -int RGB565ToI420(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*RGB565ToARGBRow)(const uint8_t *src_rgb, uint8_t *dst_argb, - int width) = RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = - ARGBToYRow_C; - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - } - } -#endif - { -#if !(defined(HAS_RGB565TOYROW_NEON)) - // Allocate 2 rows of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size * 2); -#endif - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON)) -#else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); - ARGBToUVRow(row, row_size, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); -#endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON)) -#else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB565TOYROW_NEON)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} diff --git a/pkg/encoder/yuv/libyuv/convert.h b/pkg/encoder/yuv/libyuv/convert.h deleted file mode 100644 index 9a81c509c..000000000 --- a/pkg/encoder/yuv/libyuv/convert.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_H_ -#define INCLUDE_LIBYUV_CONVERT_H_ - -#include "rotate.h" // For enum RotationMode. - -// Copy I420 to I420. -#define I420ToI420 I420Copy -LIBYUV_API -int I420Copy(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// ARGB little endian (bgra in memory) to I420. -LIBYUV_API -int ARGBToI420(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// ABGR little endian (rgba in memory) to I420. -LIBYUV_API -int ABGRToI420(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. -// "dst_stride_y" number of bytes in a row of the dst_y plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToI420(const uint8_t *sample, - size_t sample_size, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#endif // INCLUDE_LIBYUV_CONVERT_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/convert_argb.h b/pkg/encoder/yuv/libyuv/convert_argb.h deleted file mode 100644 index ac8e97169..000000000 --- a/pkg/encoder/yuv/libyuv/convert_argb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ - -#include "basic_types.h" - -// Conversion matrix for YVU to BGR -LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full - -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/pkg/encoder/yuv/libyuv/convert_to_i420.c b/pkg/encoder/yuv/libyuv/convert_to_i420.c deleted file mode 100644 index 848021427..000000000 --- a/pkg/encoder/yuv/libyuv/convert_to_i420.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "convert.h" -#include "video_common.h" - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. -LIBYUV_API -int ConvertToI420(const uint8_t *sample, - size_t sample_size, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - const uint8_t *src; - // TODO(nisse): Why allow crop_height < 0? - const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && - format != FOURCC_NV21 && format != FOURCC_YV12) || - dst_y == sample; - uint8_t *tmp_y = dst_y; - uint8_t *tmp_u = dst_u; - uint8_t *tmp_v = dst_v; - int tmp_y_stride = dst_stride_y; - int tmp_u_stride = dst_stride_u; - int tmp_v_stride = dst_stride_v; - uint8_t *rotate_buffer = NULL; - const int inv_crop_height = - (src_height < 0) ? -abs_crop_height : abs_crop_height; - - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { - return -1; - } - - // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination dst_y is same as source sample, - // also enable temporary buffer. 
- if (need_buf) { - int y_size = crop_width * abs_crop_height; - int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8_t *) malloc(y_size + uv_size * 2); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. - } - dst_y = rotate_buffer; - dst_u = dst_y + y_size; - dst_v = dst_u + uv_size; - dst_stride_y = crop_width; - dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); - } - - switch (format) { - // Single plane formats - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - default: - r = -1; // unknown fourcc - return failure code. - } - - if (need_buf) { - if (!r) { - r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, crop_width, abs_crop_height, - rotation); - } - free(rotate_buffer); - } - - return r; -} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.c b/pkg/encoder/yuv/libyuv/cpu_id.c deleted file mode 100644 index 166057de5..000000000 --- a/pkg/encoder/yuv/libyuv/cpu_id.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "cpu_id.h" - -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) -#include // For _xgetbv() -#endif - -// For ArmCpuCaps() but unittested on all platforms -#include // For fopen() -#include - -// For functions that use the stack and have runtime checks for overflow, -// use SAFEBUFFERS to avoid additional check. -#define SAFEBUFFERS - -// cpu_info_ variable for SIMD instruction sets detected. -LIBYUV_API int cpu_info_ = 0; - -// Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int *cpu_info) { -#if defined(_MSC_VER) - // GCC version uses inline x86 assembly. -#else // defined(_MSC_VER) - int info_ebx, info_edx; - asm volatile( -#if defined(__i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D"(info_ebx), -#else - "cpuid \n" - : "=b"(info_ebx), -#endif // defined( __i386__) && defined(__PIC__) - "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); - cpu_info[0] = info_eax; - cpu_info[1] = info_ebx; - cpu_info[2] = info_ecx; - cpu_info[3] = info_edx; -#endif // defined(_MSC_VER) -} - -#else // (defined(_M_IX86) || defined(_M_X64) ... 
-LIBYUV_API -void CpuId(int eax, int ecx, int* cpu_info) { - (void)eax; - (void)ecx; - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} -#endif - -// For VS2010 and earlier emit can be used: -// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. -// __asm { -// xor ecx, ecx // xcr 0 -// xgetbv -// mov xcr0, eax -// } -// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. -// https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) -#pragma optimize("g", off) -#endif -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) - -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -static int GetXCR0() { - int xcr0 = 0; -#if defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); -#endif // defined(__i386__) || defined(__x86_64__) - return xcr0; -} - -#else -// xgetbv unavailable to query for OSSave support. Return 0. -#define GetXCR0() 0 -#endif // defined(_M_IX86) || defined(_M_X64) .. -// Return optimization to previous setting. -#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) -#pragma optimize("g", on) -#endif - -// Based on libvpx arm_cpudetect.c -// For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char *cpuinfo_name) { - char cpuinfo_line[512]; - FILE *f = fopen(cpuinfo_name, "re"); - if (!f) { - // Assume Neon if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return kCpuHasNEON; - } - memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); - while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { - if (memcmp(cpuinfo_line, "Features", 8) == 0) { - char *p = strstr(cpuinfo_line, " neon"); - if (p && (p[5] == ' ' || p[5] == '\n')) { - fclose(f); - return kCpuHasNEON; - } - // aarch64 uses asimd for Neon. - p = strstr(cpuinfo_line, " asimd"); - if (p) { - fclose(f); - return kCpuHasNEON; - } - } - } - fclose(f); - return 0; -} - -static SAFEBUFFERS int GetCpuFlags(void) { - int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86)) - int cpu_info0[4] = {0, 0, 0, 0}; - int cpu_info1[4] = {0, 0, 0, 0}; - int cpu_info7[4] = {0, 0, 0, 0}; - CpuId(0, 0, cpu_info0); - CpuId(1, 0, cpu_info1); - if (cpu_info0[0] >= 7) { - CpuId(7, 0, cpu_info7); - } - cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | - ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | - ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | - ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); - - // AVX requires OS saves YMM registers. - if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave - ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); - - // Detect AVX512bw - if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; - cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; - cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; - cpu_info |= (cpu_info7[2] & 0x00000040) ? 
kCpuHasAVX512VBMI2 : 0; - cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; - cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; - cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; - } - } -#endif -#if defined(__arm__) || defined(__aarch64__) - // gcc -mfpu=neon defines __ARM_NEON__ - // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. - // For Linux, /proc/cpuinfo can be tested but without that assume Neon. -#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) - cpu_info = kCpuHasNEON; - // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon - // flag in it. - // So for aarch64, neon enabling is hard coded here. -#endif -#if defined(__aarch64__) - cpu_info = kCpuHasNEON; -#else - // Linux arm parse text file for neon detect. - cpu_info = ArmCpuCaps("/proc/cpuinfo"); -#endif - cpu_info |= kCpuHasARM; -#endif // __arm__ - cpu_info |= kCpuInitialized; - return cpu_info; -} - -// Note that use of this function is not thread safe. -LIBYUV_API -int MaskCpuFlags(int enable_flags) { - int cpu_info = GetCpuFlags() & enable_flags; - SetCpuFlags(cpu_info); - return cpu_info; -} - -LIBYUV_API -int InitCpuFlags(void) { - return MaskCpuFlags(-1); -} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.h b/pkg/encoder/yuv/libyuv/cpu_id.h deleted file mode 100644 index bf50b9cd1..000000000 --- a/pkg/encoder/yuv/libyuv/cpu_id.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ -#define INCLUDE_LIBYUV_CPU_ID_H_ - -#include "basic_types.h" - -// Internal flag to indicate cpuid requires initialization. -static const int kCpuInitialized = 0x1; - -// These flags are only valid on ARM processors. -static const int kCpuHasARM = 0x2; -static const int kCpuHasNEON = 0x4; -// 0x8 reserved for future ARM flag. - -// These flags are only valid on x86 processors. -static const int kCpuHasX86 = 0x10; -static const int kCpuHasSSE2 = 0x20; -static const int kCpuHasSSSE3 = 0x40; -static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; // unused at this time. -static const int kCpuHasAVX = 0x200; -static const int kCpuHasAVX2 = 0x400; -static const int kCpuHasERMS = 0x800; -static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasF16C = 0x2000; -static const int kCpuHasGFNI = 0x4000; -static const int kCpuHasAVX512BW = 0x8000; -static const int kCpuHasAVX512VL = 0x10000; -static const int kCpuHasAVX512VNNI = 0x20000; -static const int kCpuHasAVX512VBMI = 0x40000; -static const int kCpuHasAVX512VBMI2 = 0x80000; -static const int kCpuHasAVX512VBITALG = 0x100000; -static const int kCpuHasAVX512VPOPCNTDQ = 0x200000; - -// Optional init function. TestCpuFlag does an auto-init. -// Returns cpu_info flags. -LIBYUV_API -int InitCpuFlags(void); - -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. 
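Throughout the deleted convert.c and rotate.c, each row kernel defaults to the portable *_C implementation and is upgraded to an SSSE3/AVX2 variant only when TestCpuFlag reports the feature and the width meets the kernel's alignment. A rough Go analogue of that dispatch pattern (a sketch only; it assumes the kCpuHasAVX2 value from cpu_id.h above and is not code from this repository):

package main

// rowFn mirrors the shape of the C row functions (e.g. ARGBToYRow_*).
type rowFn func(src, dst []byte, width int)

// kCpuHasAVX2 copies the flag value declared in cpu_id.h above.
const kCpuHasAVX2 = 0x400

// pickRowFn starts from the portable implementation and only swaps in the
// SIMD variant when the CPU flag is set and the width meets the alignment
// that kernel expects (32 pixels for the AVX2 rows in convert.c).
func pickRowFn(cpuInfo, width int, generic, avx2 rowFn) rowFn {
	if cpuInfo&kCpuHasAVX2 != 0 && width%32 == 0 {
		return avx2
	}
	return generic
}

func main() {
	generic := func(src, dst []byte, w int) { copy(dst[:w], src[:w]) }
	avx2 := generic // stand-in; a real build would point at the SIMD kernel
	fn := pickRowFn(kCpuHasAVX2, 1280, generic, avx2)
	fn(make([]byte, 1280), make([]byte, 1280), 1280)
}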
-// Returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); -#else - int cpu_info = cpu_info_; -#endif - return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; -} - -// Internal function for parsing /proc/cpuinfo. -LIBYUV_API -int ArmCpuCaps(const char *cpuinfo_name); - -// For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -// MaskCpuFlags(-1) to enable all cpu specific optimizations. -// MaskCpuFlags(1) to disable all cpu specific optimizations. -// MaskCpuFlags(0) to reset state so next call will auto init. -// Returns cpu_info flags. -LIBYUV_API -int MaskCpuFlags(int enable_flags); - -// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| -// should be a valid combination of the kCpuHas constants above and include -// kCpuInitialized. Use this method when running in a sandboxed process where -// the detection code might fail (as it might access /proc/cpuinfo). In such -// cases the cpu_info can be obtained from a non sandboxed process by calling -// InitCpuFlags() and passed to the sandboxed process (via command line -// parameters, IPC...) which can then call this method to initialize the CPU -// flags. -// Notes: -// - when specifying 0 for |cpu_flags|, the auto initialization is enabled -// again. -// - enabling CPU features that are not supported by the CPU will result in -// undefined behavior. -// TODO(fbarchard): consider writing a helper function that translates from -// other library CPU info to libyuv CPU info and add a .md doc that explains -// CPU detection. -static __inline void SetCpuFlags(int cpu_flags) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); -#else - cpu_info_ = cpu_flags; -#endif -} - -// Low level cpuid for X86. Returns zeros on other CPUs. -// eax is the info type that you want. -// ecx is typically the cpu number, and should normally be zero. -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int *cpu_info); - -#endif // INCLUDE_LIBYUV_CPU_ID_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/libyuv.go b/pkg/encoder/yuv/libyuv/libyuv.go index 98d4276ff..8bde0ad89 100644 --- a/pkg/encoder/yuv/libyuv/libyuv.go +++ b/pkg/encoder/yuv/libyuv/libyuv.go @@ -1,18 +1,43 @@ -//go:build !darwin && !no_libyuv - +// Package libyuv contains the wrapper for: https://chromium.googlesource.com/libyuv/libyuv. +// Libs are downloaded from: https://packages.macports.org/libyuv/. 
package libyuv -// see: https://chromium.googlesource.com/libyuv/libyuv - /* -#cgo CFLAGS: -Wall -#cgo LDFLAGS: -lyuv +#cgo !darwin LDFLAGS: -lyuv + +#cgo darwin CFLAGS: -DINCLUDE_LIBYUV_VERSION_H_ +#cgo darwin LDFLAGS: -L${SRCDIR} -lstdc++ +#cgo darwin,amd64 LDFLAGS: -lyuv_darwin_x86_64 -ljpeg -lstdc++ +#cgo darwin,arm64 LDFLAGS: -lyuv_darwin_arm64 -ljpeg -lstdc++ -#include +#include // for uintptr_t and C99 types + +#if !defined(LIBYUV_API) +#define LIBYUV_API +#endif // LIBYUV_API + +#ifndef INCLUDE_LIBYUV_VERSION_H_ #include "libyuv/version.h" -#include "libyuv/video_common.h" +#else +#define LIBYUV_VERSION 1874 // darwin static libs version +#endif // INCLUDE_LIBYUV_VERSION_H_ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) + +enum FourCC { + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_ANY = -1, +}; -// typedef enum RotationMode { kRotate0 = 0, // No rotation. kRotate90 = 90, // Rotate 90 degrees clockwise. @@ -20,7 +45,6 @@ typedef enum RotationMode { kRotate270 = 270, // Rotate 270 degrees clockwise. } RotationModeEnum; -// LIBYUV_API int ConvertToI420(const uint8_t* sample, size_t sample_size, @@ -65,6 +89,11 @@ int I420Scale(const uint8_t *src_y, int dst_width, int dst_height, enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif */ import "C" import "fmt" diff --git a/pkg/encoder/yuv/libyuv/libyuv2.go b/pkg/encoder/yuv/libyuv/libyuv2.go deleted file mode 100644 index f4f6a68b5..000000000 --- a/pkg/encoder/yuv/libyuv/libyuv2.go +++ /dev/null @@ -1,89 +0,0 @@ -//go:build darwin || no_libyuv - -package libyuv - -/* -#cgo CFLAGS: -Wall - -#include "basic_types.h" -#include "version.h" -#include "video_common.h" -#include "rotate.h" -#include "scale.h" -#include "convert.h" - -*/ -import "C" -import "fmt" - -const FourccRgbp uint32 = C.FOURCC_RGBP -const FourccArgb uint32 = C.FOURCC_ARGB -const FourccAbgr uint32 = C.FOURCC_ABGR - -func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) { - cw := (dw + 1) / 2 - ch := (dh + 1) / 2 - i0 := dw * dh - i1 := i0 + cw*ch - yStride := dw - cStride := cw - - C.ConvertToI420( - (*C.uchar)(&src[0]), - C.size_t(0), - (*C.uchar)(&dst[0]), - C.int(yStride), - (*C.uchar)(&dst[i0]), - C.int(cStride), - (*C.uchar)(&dst[i1]), - C.int(cStride), - C.int(0), - C.int(0), - C.int(stride), - C.int(h), - C.int(cx), - C.int(cy), - C.enum_RotationMode(rot), - C.uint32_t(pix)) -} - -func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) { - srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1 - srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1 - - srcYPlaneSize, dstYPlaneSize := w*h, dw*dh - srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV - - srcStrideY, dstStrideY := w, dw - srcStrideU, dstStrideU := srcWidthUV, dstWidthUV - srcStrideV, dstStrideV := srcWidthUV, dstWidthUV - - srcY := (*C.uchar)(&src[0]) - srcU := (*C.uchar)(&src[srcYPlaneSize]) - srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize]) - - dstY := (*C.uchar)(&dst[0]) - dstU := (*C.uchar)(&dst[dstYPlaneSize]) - dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize]) - - C.I420Scale( - srcY, - C.int(srcStrideY), - srcU, - C.int(srcStrideU), - srcV, - C.int(srcStrideV), - C.int(w), - C.int(h), - 
dstY, - C.int(dstStrideY), - dstU, - C.int(dstStrideU), - dstV, - C.int(dstStrideV), - C.int(dw), - C.int(dh), - C.enum_FilterMode(C.kFilterNone)) -} - -func Version() string { return fmt.Sprintf("%v mod", int(C.LIBYUV_VERSION)) } diff --git a/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a b/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a new file mode 100644 index 000000000..f399a41c7 Binary files /dev/null and b/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a differ diff --git a/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a b/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a new file mode 100644 index 000000000..63cd5c74a Binary files /dev/null and b/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a differ diff --git a/pkg/encoder/yuv/libyuv/planar_functions.c b/pkg/encoder/yuv/libyuv/planar_functions.c deleted file mode 100644 index a5d543cc5..000000000 --- a/pkg/encoder/yuv/libyuv/planar_functions.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "planar_functions.h" - -#include "cpu_id.h" -#include "row.h" - -// Copy a plane of data -LIBYUV_API -void CopyPlane(const uint8_t *src_y, - int src_stride_y, - uint8_t *dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; - if (width <= 0 || height == 0) { - return; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Nothing to do. - if (src_y == dst_y && src_stride_y == dst_stride_y) { - return; - } - -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} diff --git a/pkg/encoder/yuv/libyuv/planar_functions.h b/pkg/encoder/yuv/libyuv/planar_functions.h deleted file mode 100644 index 222109cfc..000000000 --- a/pkg/encoder/yuv/libyuv/planar_functions.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
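The deleted libyuv2.go above was the darwin-only twin of libyuv.go; assuming the kept file exposes the same exported Y420, Y420Scale, and FourccAbgr (only the cgo plumbing changes in this diff), a hypothetical caller inside the package could convert and downscale a frame like this:

package libyuv

// exampleConvertAndScale is an illustrative sketch, not part of this change.
// It converts a 640x480 RGBA-in-memory frame (FourccAbgr, per convert.h:
// "ABGR little endian (rgba in memory)") to I420, then downscales to 320x240.
func exampleConvertAndScale(frame []byte) []byte {
	const w, h, dw, dh = 640, 480, 320, 240
	i420 := make([]byte, w*h+2*((w+1)/2)*((h+1)/2))
	// no rotation, crop covers the whole frame, source stride given in pixels
	Y420(frame, i420, w, h, w, w, h, 0, FourccAbgr, w, h)
	out := make([]byte, dw*dh+2*((dw+1)/2)*((dh+1)/2))
	Y420Scale(i420, out, w, h, dw, dh)
	return out
}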
- */ - -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ - -#include "basic_types.h" - -// TODO(fbarchard): Move cpu macros to row.h -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_ARGBAFFINEROW_SSE2 -#endif - -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8_t *src_y, - int src_stride_y, - uint8_t *dst_y, - int dst_stride_y, - int width, - int height); - -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/rotate.c b/pkg/encoder/yuv/libyuv/rotate.c deleted file mode 100644 index 4aabae5b0..000000000 --- a/pkg/encoder/yuv/libyuv/rotate.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate.h" - -#include "convert.h" -#include "cpu_id.h" -#include "rotate_row.h" -#include "row.h" - -LIBYUV_API -void TransposePlane(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - int i = height; - - void (*TransposeWx8)(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int width) = TransposeWx8_C; - -#if defined(HAS_TRANSPOSEWX8_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; - } - } -#endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; - } - } -#endif - - // Work across the source in 8x8 tiles - while (i >= 8) { - TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. - i -= 8; - } - - if (i > 0) { - TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); - } -} - -LIBYUV_API -void RotatePlane90(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane270(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Rotate by 270 is a transpose with the destination written - // from bottom to top. 
So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane180(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Swap top and bottom row and mirror the content. Uses a temporary row. - align_buffer_64(row, width); - const uint8_t *src_bot = src + src_stride * (height - 1); - uint8_t *dst_bot = dst + dst_stride * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*MirrorRow)(const uint8_t *src, uint8_t *dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) -#endif - // Odd height will harmlessly mirror the middle row twice. - for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy top row into buffer - MirrorRow(src_bot, dst, width); // Mirror bottom row into top row - MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; - } - free_aligned_buffer_64(row); -} - -LIBYUV_API -int I420Rotate(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case kRotate0: - // copy frame - return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - default: - break; - } - return -1; -} diff --git a/pkg/encoder/yuv/libyuv/rotate.h b/pkg/encoder/yuv/libyuv/rotate.h deleted file mode 100644 index 59b9ec3cb..000000000 --- a/pkg/encoder/yuv/libyuv/rotate.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_H_ -#define INCLUDE_LIBYUV_ROTATE_H_ - -#include "basic_types.h" - -// Supported rotation. -typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. - kRotate180 = 180, // Rotate 180 degrees. - kRotate270 = 270, // Rotate 270 degrees clockwise. -} RotationModeEnum; - -// Rotate I420 frame. -LIBYUV_API -int I420Rotate(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate planes by 90, 180, 270. Deprecated. -LIBYUV_API -void RotatePlane90(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane180(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane270(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -// The 90 and 270 functions are based on transposes. -// Doing a transpose with reversing the read/write -// order will result in a rotation by +- 90 degrees. -// Deprecated. 
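rotate.h below describes the 90/270 rotations as transposes with a reversed read or write order. A small Go sketch (illustrative, not from this change) of the same idea for a single plane, mirroring what RotatePlane90 does by pointing src at the last row and negating the stride before calling TransposePlane:

package main

import "fmt"

// rotate90 rotates a w x h plane 90 degrees clockwise by reading the source
// bottom-up and transposing it into the destination.
func rotate90(src []byte, w, h int) []byte {
	dst := make([]byte, w*h) // result is h wide and w tall
	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			dst[x*h+(h-1-y)] = src[y*w+x]
		}
	}
	return dst
}

func main() {
	// 2x3 plane: rows "abc" and "def" become columns read bottom-up.
	fmt.Println(string(rotate90([]byte("abcdef"), 3, 2))) // prints "daebfc"
}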
-LIBYUV_API -void TransposePlane(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/pkg/encoder/yuv/libyuv/rotate_any.c b/pkg/encoder/yuv/libyuv/rotate_any.c deleted file mode 100644 index 9af8c04ab..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_any.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" - -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ - int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ - } - -#ifdef HAS_TRANSPOSEWX8_SSSE3 - -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) - -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 - -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) - -#endif -#undef TANY - -#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ - int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ - dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ - } - -#ifdef HAS_TRANSPOSEUVWX8_SSE2 - -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) - -#endif -#undef TUVANY diff --git a/pkg/encoder/yuv/libyuv/rotate_common.c b/pkg/encoder/yuv/libyuv/rotate_common.c deleted file mode 100644 index 20c1481a7..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_common.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "rotate_row.h" - -void TransposeWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -void TransposeUVWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -void TransposeWxH_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} diff --git a/pkg/encoder/yuv/libyuv/rotate_gcc.c b/pkg/encoder/yuv/libyuv/rotate_gcc.c deleted file mode 100644 index 54fdafff8..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_gcc.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) - -void TransposeWx8_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) - -// Transpose 16x8. 64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -void TransposeWx8_Fast_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} - -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -// Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) - -void TransposeUVWx8_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t) (src_stride)), // %4 - "r"((intptr_t) (dst_stride_a)), // %5 - "r"((intptr_t) (dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} - -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/rotate_row.h b/pkg/encoder/yuv/libyuv/rotate_row.h deleted file mode 100644 index afdae49f0..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_row.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ -#define INCLUDE_LIBYUV_ROTATE_ROW_H_ - -#include "basic_types.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif - -// The following are available for GCC 32 or 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) -#define HAS_TRANSPOSEWX8_SSSE3 -#endif - -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -void TransposeWxH_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -void TransposeWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Fast_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Any_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Fast_Any_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeUVWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_Any_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/pkg/encoder/yuv/libyuv/row.h b/pkg/encoder/yuv/libyuv/row.h deleted file mode 100644 index ca1c0c298..000000000 --- a/pkg/encoder/yuv/libyuv/row.h +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROW_H_ -#define INCLUDE_LIBYUV_ROW_H_ - -#include // For NULL -#include // For malloc - -#include "basic_types.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif - -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions: -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#endif - -// Effects: -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 - -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_ARGBEXTRACTALPHAROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 -#define HAS_COPYROW_AVX -#define HAS_INTERPOLATEROW_AVX2 -#define HAS_MIRRORROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVROW_AVX2 -#endif - -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_MIRRORUVROW_SSSE3 - -#endif - -// The following are available for AVX2 gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_ABGRTOYROW_AVX2 -#define HAS_MIRRORUVROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_AVX2 -#endif - -#endif - -#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) - #if defined(VISUALC_HAS_AVX2) -#define SIMD_ALIGNED(var) __declspec(align(32)) var -#else -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#endif -#define LIBYUV_NOINLINE __declspec(noinline) -typedef __declspec(align(16)) int16_t vec16[8]; -typedef __declspec(align(16)) int32_t vec32[4]; -typedef __declspec(align(16)) float vecf32[4]; -typedef __declspec(align(16)) int8_t vec8[16]; -typedef __declspec(align(16)) uint16_t uvec16[8]; -typedef __declspec(align(16)) uint32_t uvec32[4]; -typedef __declspec(align(16)) uint8_t uvec8[16]; -typedef __declspec(align(32)) int16_t lvec16[16]; -typedef __declspec(align(32)) int32_t lvec32[8]; -typedef __declspec(align(32)) int8_t lvec8[32]; -typedef __declspec(align(32)) uint16_t ulvec16[16]; -typedef __declspec(align(32)) uint32_t ulvec32[8]; -typedef __declspec(align(32)) uint8_t ulvec8[32]; -#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) -// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
-#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) -#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#endif -#define LIBYUV_NOINLINE __attribute__((noinline)) -typedef int16_t __attribute__((vector_size(16))) vec16; -typedef int32_t __attribute__((vector_size(16))) vec32; -typedef float __attribute__((vector_size(16))) vecf32; -typedef int8_t __attribute__((vector_size(16))) vec8; -typedef uint16_t __attribute__((vector_size(16))) uvec16; -typedef uint32_t __attribute__((vector_size(16))) uvec32; -typedef uint8_t __attribute__((vector_size(16))) uvec8; -typedef int16_t __attribute__((vector_size(32))) lvec16; -typedef int32_t __attribute__((vector_size(32))) lvec32; -typedef int8_t __attribute__((vector_size(32))) lvec8; -typedef uint16_t __attribute__((vector_size(32))) ulvec16; -typedef uint32_t __attribute__((vector_size(32))) ulvec32; -typedef uint8_t __attribute__((vector_size(32))) ulvec8; -#else -#define SIMD_ALIGNED(var) var -#define LIBYUV_NOINLINE -typedef int16_t vec16[8]; -typedef int32_t vec32[4]; -typedef float vecf32[4]; -typedef int8_t vec8[16]; -typedef uint16_t uvec16[8]; -typedef uint32_t uvec32[4]; -typedef uint8_t uvec8[16]; -typedef int16_t lvec16[16]; -typedef int32_t lvec32[8]; -typedef int8_t lvec8[32]; -typedef uint16_t ulvec16[16]; -typedef uint32_t ulvec32[8]; -typedef uint8_t ulvec8[32]; -#endif - -#if !defined(__aarch64__) || !defined(__arm__) -// This struct is for Intel color conversion. -struct YuvConstants { - uint8_t kUVToB[32]; - uint8_t kUVToG[32]; - uint8_t kUVToR[32]; - int16_t kYToRgb[16]; - int16_t kYBiasToRgb[16]; -}; -#endif - -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) - -#define align_buffer_64(var, size) \ - void* var##_mem = malloc((size) + 63); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = NULL - -#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) -#define OMITFP -#else -#define OMITFP __attribute__((optimize("omit-frame-pointer"))) -#endif - -// NaCL macros for GCC x86 and x64. 
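The IS_ALIGNED and align_buffer_64 macros above use the usual over-allocate-and-round-up trick: malloc 63 extra bytes, then round the pointer up to the next 64-byte boundary and free the raw pointer later. A minimal stand-alone sketch of that arithmetic (names and values here are illustrative, not libyuv symbols):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  size_t size = 100;
  void* mem = malloc(size + 63);                 /* over-allocate by 63 bytes */
  if (!mem) return 1;
  uint8_t* buf = (uint8_t*)(((uintptr_t)mem + 63) & ~(uintptr_t)63);
  assert(((uintptr_t)buf & 63) == 0);            /* rounded up to a 64-byte boundary */
  assert((uintptr_t)buf - (uintptr_t)mem < 64);  /* still inside the allocation */
  free(mem);                                     /* free the raw pointer, not buf */
  return 0;
}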
-#if defined(__native_client__) -#define LABELALIGN ".p2align 5\n" -#else -#define LABELALIGN -#endif - -void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width); - -void ARGBToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void ABGRToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width); - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width); - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void ARGBToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); - -void ABGRToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); - -void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width); - -void ARGBToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void BGRAToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ABGRToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBToUVRow_AVX2(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_AVX2(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_SSSE3(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, - int src_stride_bgra, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, - int src_stride_rgba, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_Any_AVX2(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_Any_AVX2(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGB565ToUVRow_C(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); - -void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width); - -void 
MirrorRow_C(const uint8_t *src, uint8_t *dst, int width); - -void MirrorRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorRow_Any_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width); - -void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width); - -void MirrorUVRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorUVRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_C(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void ARGBMirrorRow_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_C(const uint8_t *src, uint8_t *dst, int count); - -void CopyRow_Any_SSE2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void CopyRow_Any_AVX(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void RGB565ToARGBRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void RGB565ToARGBRow_AVX2(const uint8_t *src_rgb565, - uint8_t *dst_argb, - int width); - -void RGB565ToARGBRow_C(const uint8_t *src_rgb565, uint8_t *dst_argb, int width); - -void RGB565ToARGBRow_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void RGB565ToARGBRow_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -// Used for I420Scale, ARGBScale, and ARGBInterpolate. -void InterpolateRow_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); - -void InterpolateRow_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); - -void InterpolateRow_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); - -void InterpolateRow_Any_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -void InterpolateRow_Any_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -#endif // INCLUDE_LIBYUV_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/row_any.c b/pkg/encoder/yuv/libyuv/row_any.c deleted file mode 100644 index fcc49c672..000000000 --- a/pkg/encoder/yuv/libyuv/row_any.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#include // For memset. - -// Subsampled source needs to be increase by 1 of not even. -#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) - -// Any 1 to 1. 
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t vin[128]); \ - SIMD_ALIGNED(uint8_t vout[128]); \ - memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(vin, vout, MASK + 1); \ - memcpy(dst_ptr + n * BPP, vout, r * BPP); \ - } - -#ifdef HAS_COPYROW_AVX - -ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) - -#endif -#ifdef HAS_COPYROW_SSE2 - -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) - -#endif - -#ifdef HAS_ARGBTOYROW_AVX2 - -ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) - -#endif -#ifdef HAS_ABGRTOYROW_AVX2 - -ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) - -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 - -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) - -#endif -#ifdef HAS_BGRATOYROW_SSSE3 - -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) - -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) - -#endif - -#undef ANY11 - -// Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ - void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS vin[64 * 2]); \ - SIMD_ALIGNED(TD vout[64]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ - } \ - memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ - if (source_y_fraction) { \ - memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ - r * SBPP * sizeof(TS)); \ - } \ - ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ - } - -#ifdef HAS_INTERPOLATEROW_AVX2 - -ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) - -#endif -#ifdef HAS_INTERPOLATEROW_SSSE3 - -ANY11I(InterpolateRow_Any_SSSE3, - InterpolateRow_SSSE3, - uint8_t, - uint8_t, - 1, - 1, - 15) - -#endif - -#undef ANY11I - -// Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t vin[64]); \ - SIMD_ALIGNED(uint8_t vout[64]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(vin, src_ptr, r* BPP); \ - ANY_SIMD(vin, vout, MASK + 1); \ - memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ - } - -#ifdef HAS_MIRRORROW_AVX2 - -ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) - -#endif -#ifdef HAS_MIRRORROW_SSSE3 - -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) - -#endif -#ifdef HAS_MIRRORUVROW_AVX2 - -ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) - -#endif -#ifdef HAS_MIRRORUVROW_SSSE3 - -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) - -#endif -#ifdef HAS_ARGBMIRRORROW_AVX2 - -ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) - -#endif -#ifdef HAS_ARGBMIRRORROW_SSE2 - -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) - -#endif -#undef ANY11M - -// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -// 128 byte row allows for 32 avx ARGB pixels. 
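The ANY11 family of macros above wraps a fixed-width SIMD kernel into an "any width" row function: the kernel runs on the largest multiple of its vector width, and the 1..MASK leftover pixels go through a small zero-padded stack buffer so the kernel never reads or writes past the end of the row. A minimal sketch of the same pattern (CopyRow_Kernel16 and CopyRow_Any_Sketch are hypothetical stand-ins, not libyuv symbols):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a 16-pixel-wide SIMD kernel. */
static void CopyRow_Kernel16(const uint8_t* src, uint8_t* dst, int width) {
  memcpy(dst, src, (size_t)width);
}

/* Same shape as an ANY11() expansion: full vectors straight to dst,
   then the ragged tail via a padded temporary buffer. */
static void CopyRow_Any_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t vin[16];
  uint8_t vout[16];
  int r = width & 15;   /* ragged tail, 0..15 pixels */
  int n = width & ~15;  /* largest multiple of the kernel width */
  if (n > 0) {
    CopyRow_Kernel16(src, dst, n);
  }
  if (r > 0) {
    memset(vin, 0, sizeof(vin));
    memcpy(vin, src + n, (size_t)r);   /* copy the tail into the padded buffer */
    CopyRow_Kernel16(vin, vout, 16);   /* run one full-width iteration */
    memcpy(dst + n, vout, (size_t)r);  /* write back only the valid pixels */
  }
}

int main(void) {
  uint8_t src[37], dst[37];
  for (int i = 0; i < 37; ++i) src[i] = (uint8_t)i;
  CopyRow_Any_Sketch(src, dst, 37);
  printf("%d %d\n", dst[0], dst[36]);  /* 0 36 */
  return 0;
}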
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ - uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t vin[128 * 2]); \ - SIMD_ALIGNED(uint8_t vout[128 * 2]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ - } \ - memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ - vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ - memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ - } - -#ifdef HAS_ARGBTOUVROW_AVX2 - -ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) - -#endif -#ifdef HAS_ABGRTOUVROW_AVX2 - -ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) - -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 - -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) - -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) - -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) - -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) - -#endif -#undef ANY12S diff --git a/pkg/encoder/yuv/libyuv/row_common.c b/pkg/encoder/yuv/libyuv/row_common.c deleted file mode 100644 index 34a93a074..000000000 --- a/pkg/encoder/yuv/libyuv/row_common.c +++ /dev/null @@ -1,887 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#include -#include // For memcpy and memset. - -#define STATIC_CAST(type, expr) (type)(expr) - -// This macro controls YUV to RGB using unsigned math to extend range of -// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: -// LIBYUV_UNLIMITED_DATA - -// Macros to enable unlimited data for each colorspace -// LIBYUV_UNLIMITED_BT601 -// LIBYUV_UNLIMITED_BT709 -// LIBYUV_UNLIMITED_BT2020 - -#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86)) -#define LIBYUV_ARGBTOUV_PAVGB 1 -#define LIBYUV_RGBTOU_TRUNCATE 1 -#endif -#if defined(LIBYUV_BIT_EXACT) -#define LIBYUV_UNATTENUATE_DUP 1 -#endif - -// llvm x86 is poor at ternary operator, so use branchless min/max. - -#define USE_BRANCHLESS 1 -#if USE_BRANCHLESS - -static __inline int32_t clamp0(int32_t v) { - return -(v >= 0) & v; -} - -// TODO(fbarchard): make clamp255 preserve negative values. 
-static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -static __inline int32_t clamp1023(int32_t v) { - return (-(v >= 1023) | v) & 1023; -} - -// clamp to max -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (-(v >= max) | v) & max; -} - -static __inline uint32_t Abs(int32_t v) { - int m = -(v < 0); - return (v + m) ^ m; -} - -#else // USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return (v < 0) ? 0 : v; -} - -static __inline int32_t clamp255(int32_t v) { - return (v > 255) ? 255 : v; -} - -static __inline int32_t clamp1023(int32_t v) { - return (v > 1023) ? 1023 : v; -} - -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (v > max) ? max : v; -} - -static __inline uint32_t Abs(int32_t v) { - return (v < 0) ? -v : v; -} -#endif // USE_BRANCHLESS - -static __inline uint32_t Clamp(int32_t val) { - int v = clamp0(val); - return (uint32_t) (clamp255(v)); -} - -static __inline uint32_t Clamp10(int32_t val) { - int v = clamp0(val); - return (uint32_t) (clamp1023(v)); -} - -// Little Endian -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define WRITEWORD(p, v) *(uint32_t*)(p) = v -#else -static inline void WRITEWORD(uint8_t* p, uint32_t v) { - p[0] = (uint8_t)(v & 255); - p[1] = (uint8_t)((v >> 8) & 255); - p[2] = (uint8_t)((v >> 16) & 255); - p[3] = (uint8_t)((v >> 24) & 255); -} -#endif - -void RGB565ToARGBRow_C(const uint8_t *src_rgb565, - uint8_t *dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); - dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); - dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb565 += 2; - } -} - -// 8 bit -// Intel SSE/AVX uses the following equivalent formula -// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. -// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + -// 0x7e80) >> 8; - -static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); -} - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - -// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. -#ifdef LIBYUV_RGBTOU_TRUNCATE - -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); -} - -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); -} - -#else -// TODO(fbarchard): Add rounding to x86 SIMD and use this -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); -} -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); -} -#endif - -// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. 
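RGBToY above uses the BT.601 studio-range weights (66, 129, 25 in 8-bit fixed point) with the +16 offset and rounding folded into the 0x1080 bias, so its output lands in the 16..235 range. A quick numeric check of that behaviour (a throwaway sketch, not code from the tree):

#include <assert.h>
#include <stdint.h>

static uint8_t RGBToY_ref(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

int main(void) {
  assert(RGBToY_ref(0, 0, 0) == 16);         /* black -> Y = 16  */
  assert(RGBToY_ref(255, 255, 255) == 235);  /* white -> Y = 235 */
  return 0;
}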
-#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); -} -static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); -} -#endif - -// ARGBToY_C and ARGBToUV_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = src_rgb[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ - } -#endif - -MAKEROWY(ARGB, 2, 1, 0, 4) - -MAKEROWY(BGRA, 1, 2, 3, 4) - -MAKEROWY(ABGR, 0, 1, 2, 4) - -MAKEROWY(RGBA, 3, 2, 1, 4) - -#undef MAKEROWY - -// JPeg uses a variation on BT.601-1 full range -// y = 0.29900 * r + 0.58700 * g + 0.11400 * b -// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center -// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center -// BT.601 Mpeg range uses: -// b 0.1016 * 255 = 25.908 = 25 -// g 0.5078 * 255 = 129.489 = 129 -// r 0.2578 * 255 = 65.739 = 66 -// JPeg 7 bit Y (deprecated) -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// 
r 0.29900 * 128 = 38.272 = 38 -// JPeg 8 bit Y: -// b 0.11400 * 256 = 29.184 = 29 -// g 0.58700 * 256 = 150.272 = 150 -// r 0.29900 * 256 = 76.544 = 77 -// JPeg 8 bit U: -// b 0.50000 * 255 = 127.5 = 127 -// g -0.33126 * 255 = -84.4713 = -84 -// r -0.16874 * 255 = -43.0287 = -43 -// JPeg 8 bit V: -// b -0.08131 * 255 = -20.73405 = -20 -// g -0.41869 * 255 = -106.76595 = -107 -// r 0.50000 * 255 = 127.5 = 127 - -// 8 bit -static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (77 * r + 150 * g + 29 * b + 128) >> 8; -} - -#if defined(LIBYUV_ARGBTOUV_PAVGB) - -static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; -} - -static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; -} - -#else -static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; -} -static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToYJ_C and ARGBToUVJ_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = 
(src_rgb[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ - } - -#endif - -MAKEROWYJ(ARGB, 2, 1, 0, 4) - -MAKEROWYJ(ABGR, 0, 1, 2, 4) - -MAKEROWYJ(RGBA, 3, 2, 1, 4) - -MAKEROWYJ(RGB24, 2, 1, 0, 3) - -MAKEROWYJ(RAW, 0, 1, 2, 3) - -#undef MAKEROWYJ - -void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r = src_rgb565[1] >> 3; - b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); - g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); - r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); - dst_y[0] = RGBToY(r, g, b); - src_rgb565 += 2; - dst_y += 1; - } -} - -void RGB565ToUVRow_C(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - const uint8_t *next_rgb565 = src_rgb565 + src_stride_rgb565; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g0 = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); - uint8_t g1 = STATIC_CAST( - uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); - uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); - uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); - uint8_t g2 = STATIC_CAST( - uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); - uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); - uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); - uint8_t g3 = STATIC_CAST( - uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); - uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); - - b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); - g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); - r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); - b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); - g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); - r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); - b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); - g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); - r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); - b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); - g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); - r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_rgb565 += 4; - next_rgb565 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g0 = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); - uint8_t g2 = STATIC_CAST( - uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); - uint8_t r2 = STATIC_CAST(uint8_t, 
next_rgb565[1] >> 3); - b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); - g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); - r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); - b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); - g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); - r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 24 - -#undef REPEAT8 -#undef SHADE - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 16 - -#undef REPEAT8 -#undef SHADE - -#define SHADE(f, v) clamp255(v + f) - -#undef SHADE - -#define SHADE(f, v) clamp0(f - v) - -#undef SHADE - -// Macros to create SIMD specific yuv to rgb conversion constants. - -// clang-format off - -#if defined(__aarch64__) || defined(__arm__) -// Bias values include subtract 128 from U and V, bias from Y and rounding. -// For B and R bias is negative. For G bias is positive. -#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ - 0, 0}} -#else -#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} -#endif - -// clang-format on - -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ - YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ - YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); - -// TODO(fbarchard): Generate SIMD structures from float matrix. - -// BT.601 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 + U * 2.018 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) -#define UB 129 /* round(2.018 * 64) */ -#else -#define UB 128 /* max(128, round(2.018 * 64)) */ -#endif -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR 102 /* round(1.596 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.601 full range YUV to RGB reference (aka JPEG) -// * R = Y + V * 1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y + U * 1.77200 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. 
-#define UB 113 /* round(1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR 90 /* round(1.40200 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.793 -// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 -// B = (Y - 16) * 1.164 + U * 2.112 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) -#define UB 135 /* round(2.112 * 64) */ -#else -#define UB 128 /* max(128, round(2.112 * 64)) */ -#endif -#define UG 14 /* round(0.213 * 64) */ -#define VG 34 /* round(0.533 * 64) */ -#define VR 115 /* round(1.793 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#define UB 119 /* round(1.8556 * 64) */ -#define UG 12 /* round(0.18732 * 64) */ -#define VG 30 /* round(0.46812 * 64) */ -#define VR 101 /* round(1.5748 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 limited range YUV to RGB reference -// R = (Y - 16) * 1.164384 + V * 1.67867 -// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 -// B = (Y - 16) * 1.164384 + U * 2.14177 -// KR = 0.2627; KB = 0.0593 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) -#define UB 137 /* round(2.142 * 64) */ -#else -#define UB 128 /* max(128, round(2.142 * 64)) */ -#endif -#define UG 12 /* round(0.187326 * 64) */ -#define VG 42 /* round(0.65042 * 64) */ -#define VR 107 /* round(1.67867 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 full range YUV to RGB reference -// R = Y + V * 1.474600 -// G = Y - U * 0.164553 - V * 0.571353 -// B = Y + U * 1.881400 -// KR = 0.2627; KB = 0.0593 - -#define UB 120 /* round(1.881400 * 64) */ -#define UG 11 /* round(0.164553 * 64) */ -#define VG 37 /* round(0.571353 * 64) */ -#define VR 94 /* round(1.474600 * 64) */ - -// Y contribution to R,G,B. Scale and bias. 
(same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -#undef BB -#undef BG -#undef BR - -#undef MAKEYUVCONSTANTS - -#if defined(__aarch64__) || defined(__arm__) -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVCoeff[0]; \ - int vr = yuvconstants->kUVCoeff[1]; \ - int ug = yuvconstants->kUVCoeff[2]; \ - int vg = yuvconstants->kUVCoeff[3]; \ - int yg = yuvconstants->kRGBCoeffBias[0]; \ - int bb = yuvconstants->kRGBCoeffBias[1]; \ - int bg = yuvconstants->kRGBCoeffBias[2]; \ - int br = yuvconstants->kRGBCoeffBias[3] - -#define CALC_RGB16 \ - int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ - int b16 = y1 + (u * ub) - bb; \ - int g16 = y1 + bg - (u * ug + v * vg); \ - int r16 = y1 + (v * vr) - br -#else -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVToB[0]; \ - int ug = yuvconstants->kUVToG[0]; \ - int vg = yuvconstants->kUVToG[1]; \ - int vr = yuvconstants->kUVToR[1]; \ - int yg = yuvconstants->kYToRgb[0]; \ - int yb = yuvconstants->kYBiasToRgb[0] - -#define CALC_RGB16 \ - int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ - int8_t ui = (int8_t)u; \ - int8_t vi = (int8_t)v; \ - ui -= 0x80; \ - vi -= 0x80; \ - int b16 = y1 + (ui * ub); \ - int g16 = y1 - (ui * ug + vi * vg); \ - int r16 = y1 + (vi * vr) -#endif - -void MirrorRow_C(const uint8_t *src, uint8_t *dst, int width) { - int x; - src += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst[x] = src[0]; - dst[x + 1] = src[-1]; - src -= 2; - } - if (width & 1) { - dst[width - 1] = src[0]; - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -// TODO(fbarchard): change scale to bits -#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) - -void CopyRow_C(const uint8_t *src, uint8_t *dst, int count) { - memcpy(dst, src, count); -} - -// Divide source RGB by alpha and store to destination. -// b = (b * 255 + (a / 2)) / a; -// g = (g * 255 + (a / 2)) / a; -// r = (r * 255 + (a / 2)) / a; -// Reciprocal method is off by 1 on some values. ie 125 -// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
-#define T(a) 0x01000000 + (0x10000 / a) -const uint32_t fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), - T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), - T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), - T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), - T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), - T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), - T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), - T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), - T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), - T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), - T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), - T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), - T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), - T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), - T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), - T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), - T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), - T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), - T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), - T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), - T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), - T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), - T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), - T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), - T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), - T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), - T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), - T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), - T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), - T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), - T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), - T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), - T(0xfc), T(0xfd), T(0xfe), 0x01000100}; -#undef T - -// Blend 2 rows into 1. -static void HalfRow_C(const uint8_t *src_uv, - ptrdiff_t src_uv_stride, - uint8_t *dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t *src_ptr1 = src_ptr + src_stride; - int x; - assert(source_y_fraction >= 0); - assert(source_y_fraction < 256); - - if (y1_fraction == 0) { - memcpy(dst_ptr, src_ptr, width); - return; - } - if (y1_fraction == 128) { - HalfRow_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width; ++x) { - dst_ptr[0] = STATIC_CAST( - uint8_t, - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); - ++src_ptr; - ++src_ptr1; - ++dst_ptr; - } -} - -// Work around GCC 7 punning warning -Wstrict-aliasing -#if defined(__GNUC__) -typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; -#else -typedef uint32_t uint32_alias_t; -#endif - -#undef STATIC_CAST diff --git a/pkg/encoder/yuv/libyuv/row_gcc.c b/pkg/encoder/yuv/libyuv/row_gcc.c deleted file mode 100644 index 07e795e60..000000000 --- a/pkg/encoder/yuv/libyuv/row_gcc.c +++ /dev/null @@ -1,1090 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -// Constants for ARGB -static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, - 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; - - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0}; - -// Constants for BGRA -static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, - 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR -static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, - 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. 
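InterpolateRow_C above blends two rows with an 8-bit vertical fraction, dst = (src0 * (256 - f) + src1 * f + 128) >> 8, with f = 0 short-circuiting to a copy and f = 128 to HalfRow_C. A per-pixel sketch of the same arithmetic (illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t blend_pixel(uint8_t src0, uint8_t src1, int f) {
  return (uint8_t)((src0 * (256 - f) + src1 * f + 128) >> 8);
}

int main(void) {
  assert(blend_pixel(100, 200, 0) == 100);    /* f = 0:   copy the top row   */
  assert(blend_pixel(100, 200, 128) == 150);  /* f = 128: average of the two */
  assert(blend_pixel(100, 200, 64) == 125);   /* f = 64:  75% / 25% blend    */
  return 0;
}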
-//static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, -// 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, - 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; - -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -// clang-format off - -// TODO(mraptis): Consider passing R, G, B multipliers as parameter. -// round parameter is register containing value to add before shift. -#define RGBTOY(round) \ - "1: \n" \ - "movdqu (%0),%%xmm0 \n" \ - "movdqu 0x10(%0),%%xmm1 \n" \ - "movdqu 0x20(%0),%%xmm2 \n" \ - "movdqu 0x30(%0),%%xmm3 \n" \ - "psubb %%xmm5,%%xmm0 \n" \ - "psubb %%xmm5,%%xmm1 \n" \ - "psubb %%xmm5,%%xmm2 \n" \ - "psubb %%xmm5,%%xmm3 \n" \ - "movdqu %%xmm4,%%xmm6 \n" \ - "pmaddubsw %%xmm0,%%xmm6 \n" \ - "movdqu %%xmm4,%%xmm0 \n" \ - "pmaddubsw %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm4,%%xmm1 \n" \ - "pmaddubsw %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm4,%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "lea 0x40(%0),%0 \n" \ - "phaddw %%xmm0,%%xmm6 \n" \ - "phaddw %%xmm2,%%xmm1 \n" \ - "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ - "psrlw $0x8,%%xmm6 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm6 \n" \ - "movdqu %%xmm6,(%1) \n" \ - "lea 0x10(%1),%1 \n" \ - "sub $0x10,%2 \n" \ - "jg 1b \n" - -#define RGBTOY_AVX2(round) \ - "1: \n" \ - "vmovdqu (%0),%%ymm0 \n" \ - "vmovdqu 0x20(%0),%%ymm1 \n" \ - "vmovdqu 0x40(%0),%%ymm2 \n" \ - "vmovdqu 0x60(%0),%%ymm3 \n" \ - "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ - "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ - "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ - "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ - "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ - "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ - "lea 0x80(%0),%0 \n" \ - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ - "vmovdqu %%ymm0,(%1) \n" \ - "lea 0x20(%1),%1 \n" \ - "sub $0x20,%2 \n" \ - "jg 1b \n" \ - "vzeroupper \n" - -// clang-format on - -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
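/* Illustrative scalar equivalents (not libyuv API) of what the RGBTOY /
 * RGBTOY_AVX2 macros and the UV row functions below compute, derived from
 * the coefficient vectors above. kSub128 and kAddY16 (0x7e80) fold the
 * signed pmaddubsw trick, the +16 luma offset and rounding into a single
 * add, so the per-pixel result is the usual BT.601 studio-range conversion: */
#include <stdint.h>

static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  /* 0x1080 = 16*256 + 128 */
}

/* U and V are computed from 2x2-averaged r/g/b (the pavgb/vpavgb steps). */
static uint8_t rgb_to_u(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static uint8_t rgb_to_v(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}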
-void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOYROW_SSSE3 - -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ - defined(HAS_ARGBEXTRACTALPHAROW_AVX2) -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; -#endif - -#ifdef HAS_ARGBTOYROW_AVX2 - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ABGRTOYROW_AVX2 - -// Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ABGRTOYROW_AVX2 - -#ifdef HAS_ARGBTOUVROW_SSSE3 - -void ARGBToUVRow_SSSE3(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : 
"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#endif // HAS_ARGBTOUVROW_SSSE3 - -#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ - defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -#endif - -#if defined(HAS_ARGBTOUVROW_AVX2) - -void ARGBToUVRow_AVX2(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ABGRTOUVROW_AVX2 - -void ABGRToUVRow_AVX2(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb 
%%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_abgr)), // %4 - "m"(kAddUV128), // %5 - "m"(kABGRToV), // %6 - "m"(kABGRToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ABGRTOUVROW_AVX2 - -void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, - int src_stride_bgra, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" 
- "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, - int src_stride_rgba, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. 
-static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 - -void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORUVROW_SSSE3 -// Shuffle table for reversing the UV. -static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORUVROW_SSSE3 - -#ifdef HAS_MIRRORUVROW_AVX2 - -void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORUVROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. 
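/* Illustrative scalar forms (not libyuv API) of the mirroring that the
 * pshufb shuffle tables above implement: MirrorRow reverses single bytes,
 * MirrorUVRow reverses interleaved UV pairs while keeping each pair's byte
 * order. */
#include <stdint.h>

static void mirror_row_sketch(const uint8_t *src, uint8_t *dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}

static void mirror_uv_row_sketch(const uint8_t *src_uv, uint8_t *dst_uv, int width) {
  for (int x = 0; x < width; ++x) {        /* width counts UV pairs */
    dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
    dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
  }
}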
-static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -void MirrorSplitUVRow_SSSE3(const uint8_t *src, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} - -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. -static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vmovdqu %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_ARGBMIRRORROW_AVX2 - - -#ifdef HAS_COPYROW_SSE2 - -void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX - -void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_COPYROW_AVX - -#ifdef HAS_COPYROW_ERMS - -// Multiple of 1. 
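/* The copy kernels above differ only in how the bytes move: CopyRow_SSE2
 * tests 16-byte alignment of both pointers to choose movdqa over movdqu,
 * CopyRow_AVX moves 64 bytes per iteration, and CopyRow_ERMS below defers
 * to "rep movsb". The portable equivalent is a plain memcpy; sketch only,
 * not libyuv API. */
#include <stdint.h>
#include <string.h>

static void copy_row_sketch(const uint8_t *src, uint8_t *dst, int width) {
  memcpy(dst, src, (size_t)width);
}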
-void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width) { - size_t width_tmp = (size_t) (width); - asm volatile( - - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); -} - -#endif // HAS_COPYROW_ERMS - -#ifdef HAS_INTERPOLATEROW_SSSE3 - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 100b \n" - - "99: \n" - "vzeroupper \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); -} - -#endif // HAS_INTERPOLATEROW_AVX2 - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale.c b/pkg/encoder/yuv/libyuv/scale.c deleted file mode 100644 index c4bd5b0b4..000000000 --- a/pkg/encoder/yuv/libyuv/scale.c +++ /dev/null @@ -1,946 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale.h" - -#include -#include - -#include "cpu_id.h" -#include "planar_functions.h" // For CopyPlane -#include "row.h" -#include "scale_row.h" - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Scale plane, 1/2 -// This is an optimized version for scaling down a plane to 1/2 of -// its original size. - -static void ScalePlaneDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_C - : ScaleRowDown2Box_C); - int row_stride = src_stride * 2; - (void) src_width; - (void) src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - - -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 - : ScaleRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 - : ScaleRowDown2Box_SSSE3); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_AVX2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 - : ScaleRowDown2Box_Any_AVX2); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_AVX2 - : ScaleRowDown2Box_AVX2); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. 
- for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane, 1/4 -// This is an optimized version for scaling down a plane to 1/4 of -// its original size. - -static void ScalePlaneDown4(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride * 4; - (void) src_width; - (void) src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN4_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane down, 3/4 -static void ScalePlaneDown34(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void) src_width; - (void) src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; - } - -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -// Scale plane, 3/8 -// This is an optimized version for scaling down a plane to 3/8 -// of its original size. -// -// Uses box filter arranges like this -// aaabbbcc -> abc -// aaabbbcc def -// aaabbbcc ghi -// dddeeeff -// dddeeeff -// dddeeeff -// ggghhhii -// ggghhhii -// Boxes are 3x3, 2x3, 3x2 and 2x2 - -static void ScalePlaneDown38(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - assert(dst_width % 3 == 0); - (void) src_width; - (void) src_height; - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; - } - -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; - } - if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } - if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -#define MIN1(x) ((x) < 1 ? 1 : (x)) - -static __inline uint32_t SumPixels(int iboxwidth, const uint16_t *src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t *src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static void ScaleAddCols2_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - int scaletbl_index = boxwidth - minboxwidth; - assert((scaletbl_index == 0) || (scaletbl_index == 1)); - *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + ix) * - scaletbl[scaletbl_index] >> - 16); - } -} - -static void ScaleAddCols0_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int scaleval = 65536 / boxheight; - int i; - (void) dx; - src_ptr += (x >> 16); - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t) (src_ptr[i] * scaleval >> 16); - } -} - -static void ScaleAddCols1_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - x >>= 16; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); - x += boxwidth; - } -} - -// Scale plane down to any dimensions, with interpolation. -// (boxfilter). 
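/* Illustrative sketch (not libyuv API) of the column pass used by the box
 * scaler: positions are 16.16 fixed point, dx is the source step per output
 * pixel, and 65536 / (boxwidth * boxheight) turns a box sum into an average
 * with one multiply and shift. This is the exact-ratio case handled by
 * ScaleAddCols1_C above; ScaleAddCols2_C covers fractional steps with a
 * two-entry scale table. row_sums holds boxheight rows already added
 * together, as in ScalePlaneBox below. */
#include <stdint.h>

static void scale_add_cols_sketch(int dst_width, int boxheight, int x, int dx,
                                  const uint16_t *row_sums, uint8_t *dst) {
  int boxwidth = dx >> 16;                      /* whole source pixels per box, >= 1 */
  int scaleval = 65536 / (boxwidth * boxheight);
  int ix = x >> 16;                             /* integer start column */
  for (int i = 0; i < dst_width; ++i) {
    uint32_t sum = 0;
    for (int k = 0; k < boxwidth; ++k) {
      sum += row_sums[ix + k];
    }
    dst[i] = (uint8_t)((sum * scaleval) >> 16);
    ix += boxwidth;
  }
}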
-// -// Same method as SimpleScale, which is fixed point, outputting -// one pixel of destination using fixed point (16.16) to step -// through source, sampling a box of pixel with simple -// averaging. -static void ScalePlaneBox(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint16_t. - align_buffer_64(row16, src_width * 2); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16_t *src_ptr, uint8_t *dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C - : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8_t *src_ptr, uint16_t *dst_ptr, - int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; - } - } -#endif -#if defined(HAS_SCALEADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleAddRow = ScaleAddRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - ScaleAddRow = ScaleAddRow_AVX2; - } - } -#endif - - - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint8_t *src = src_ptr + iy * (int64_t) src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row16, 0, src_width * 2); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16_t *) (row16), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t *) (row16), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row16); - } -} - -// Scale plane down with bilinear interpolation. -static void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row buffer. - align_buffer_64(row, src_width); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif - - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t *src = src_ptr + yi * (int64_t) src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -// Scale up down with bilinear interpolation. -static void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, - int dst_width, int x, int dx) = - filtering ? ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_C; - } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif - - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint8_t *src = src_ptr + yi * (int64_t) src_stride; - - // Allocate 2 row buffers. 
- const int row_size = (dst_width + 31) & ~31; - align_buffer_64(row, row_size * 2); - - uint8_t *rowptr = row; - int rowstride = row_size; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - if (src_height > 2) { - src += src_stride; - } - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * (int64_t) src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - if ((y + 65536) < max_y) { - src += src_stride; - } - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale plane, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of I422 to I444. -static void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - void (*ScaleRowUp)(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - (void) src_width; - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; - } -#endif - - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t) src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * (int64_t) src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of I420 to I444. -static void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - void (*Scale2RowUp)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; - int x; - - (void) src_width; - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; - } -#endif - - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale Plane to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScalePlaneSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - int i; - void (*ScaleCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, int dst_width, - int x, int dx) = ScaleCols_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t) src_stride, dst_width, x, - dx); - dst_ptr += dst_stride; - y += dy; - } -} - -// Scale a plane. -// This function dispatches to a specialized scaler based on scale factor. -LIBYUV_API -void ScalePlane(const uint8_t *src, - int src_stride, - int src_width, - int src_height, - uint8_t *dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * (int64_t) src_stride; - src_stride = -src_stride; - } - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = 0; - int y = 0; - // When scaling down, use the center 2 rows to filter. - // When scaling up, last row of destination uses the last 2 source rows. - if (dst_height <= src_height) { - dy = FixedDiv(src_height, dst_height); - y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_height > 1 && dst_height > 1) { - dy = FixedDiv1(src_height, dst_height); - } - // Arbitrary scale vertically, but unscaled horizontally. 
- ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. - if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -int I420Scale(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} diff --git a/pkg/encoder/yuv/libyuv/scale.h b/pkg/encoder/yuv/libyuv/scale.h deleted file mode 100644 
index ed0a1983f..000000000 --- a/pkg/encoder/yuv/libyuv/scale.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_H_ -#define INCLUDE_LIBYUV_SCALE_H_ - -#include "basic_types.h" - -// Supported filtering. -typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. - kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. -} FilterModeEnum; - -// Scales a YUV 4:2:0 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I420Scale(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/pkg/encoder/yuv/libyuv/scale_any.c b/pkg/encoder/yuv/libyuv/scale_any.c deleted file mode 100644 index f05e55b6e..000000000 --- a/pkg/encoder/yuv/libyuv/scale_any.c +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale_row.h" - -// Fixed scale down. -// Mask may be non-power of 2, so use MOD -#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } - -// Fixed scale down for odd source width. Used by I420Blend subsampling. -// Since dst_width is (width + 1) / 2, this function scales one less pixel -// and copies the last pixel. 
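/* Illustrative expansion (not libyuv API) of the SDANY pattern above for a
 * 1/2 scaler with a 16-pixel SIMD kernel (FACTOR = 2, BPP = 1, MASK = 15):
 * the largest multiple of 16 goes through the SIMD row function and the
 * leftover pixels through the C fallback, offset by the source pixels the
 * SIMD part already consumed. The function-pointer names are placeholders. */
#include <stddef.h>
#include <stdint.h>

typedef void (*scale_row_fn)(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, int dst_width);

static void scale_row_down2_any_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, int dst_width,
                                       scale_row_fn simd16, scale_row_fn c_fallback) {
  int r = dst_width % 16;   /* remainder, handled by the C version */
  int n = dst_width - r;    /* multiple of 16, handled by SIMD */
  if (n > 0) {
    simd16(src, src_stride, dst, n);
  }
  c_fallback(src + n * 2, src_stride, dst + n, r);  /* 2 source pixels per output */
}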
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ - int n = (dst_width - 1) - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r + 1); \ - } - -#ifdef HAS_SCALEROWDOWN2_SSSE3 - -SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) - -SDANY(ScaleRowDown2Linear_Any_SSSE3, - ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, - 2, - 1, - 15) - -SDANY(ScaleRowDown2Box_Any_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_C, - 2, - 1, - 15) - -SDODD(ScaleRowDown2Box_Odd_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) - -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -SDANY(ScaleUVRowDown2Box_Any_SSSE3, - ScaleUVRowDown2Box_SSSE3, - ScaleUVRowDown2Box_C, - 2, - 2, - 3) - -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 - -SDANY(ScaleUVRowDown2Box_Any_AVX2, - ScaleUVRowDown2Box_AVX2, - ScaleUVRowDown2Box_C, - 2, - 2, - 7) - -#endif -#ifdef HAS_SCALEROWDOWN2_AVX2 - -SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) - -SDANY(ScaleRowDown2Linear_Any_AVX2, - ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, - 2, - 1, - 31) - -SDANY(ScaleRowDown2Box_Any_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_C, - 2, - 1, - 31) - -SDODD(ScaleRowDown2Box_Odd_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 31) - -#endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 - -SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) - -SDANY(ScaleRowDown4Box_Any_SSSE3, - ScaleRowDown4Box_SSSE3, - ScaleRowDown4Box_C, - 4, - 1, - 7) - -#endif -#ifdef HAS_SCALEROWDOWN4_AVX2 - -SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) - -SDANY(ScaleRowDown4Box_Any_AVX2, - ScaleRowDown4Box_AVX2, - ScaleRowDown4Box_C, - 4, - 1, - 15) - -#endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 - -SDANY(ScaleRowDown34_Any_SSSE3, - ScaleRowDown34_SSSE3, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) - -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, - ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) - -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, - ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) - -#endif - -#ifdef HAS_SCALEROWDOWN38_SSSE3 - -SDANY(ScaleRowDown38_Any_SSSE3, - ScaleRowDown38_SSSE3, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) - -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, - ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 5) - -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, - ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 5) - -#endif - - -#undef SDANY - -// Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8_t* dst_ptr, int dst_width) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ - } - - - -#ifdef SASIMDONLY -// This also works and uses memcpy and SIMD instead of C, but is slower on ARM - -// Add rows box filter scale down. 
Using macro from row_any -#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint16_t dst_temp[32]); \ - SIMD_ALIGNED(uint8_t src_temp[32]); \ - memset(dst_temp, 0, 32 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(src_temp, dst_temp, MASK + 1); \ - memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) -#endif -#undef SAANY - -#else - -// Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 - -SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) - -#endif -#ifdef HAS_SCALEADDROW_AVX2 - -SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) - -#endif -#undef SAANY - -#endif // SASIMDONLY - -// Scale up horizontally 2 times using linear filter. -#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 1, n); \ - } \ - C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ - } \ - dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ - } - -// Even the C versions need to be wrapped, because boundary pixels have to -// be handled differently - -SUH2LANY(ScaleRowUp2_Linear_Any_C, - ScaleRowUp2_Linear_C, - ScaleRowUp2_Linear_C, - 0, - uint8_t) - -SUH2LANY(ScaleRowUp2_Linear_16_Any_C, - ScaleRowUp2_Linear_16_C, - ScaleRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - -SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, - ScaleRowUp2_Linear_SSE2, - ScaleRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - -SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, - ScaleRowUp2_Linear_SSSE3, - ScaleRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 - -SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, - ScaleRowUp2_Linear_12_SSSE3, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 - -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, - ScaleRowUp2_Linear_16_SSE2, - ScaleRowUp2_Linear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, - ScaleRowUp2_Linear_AVX2, - ScaleRowUp2_Linear_C, - 31, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, - ScaleRowUp2_Linear_12_AVX2, - ScaleRowUp2_Linear_16_C, - 31, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, - ScaleRowUp2_Linear_16_AVX2, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) - -#endif -#undef SUH2LANY - -// Scale up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 1, db - da, n); \ - } \ - C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ - } \ - da[dst_width - 1] = \ - (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ - db[dst_width - 1] = \ - (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ - } - -SU2BLANY(ScaleRowUp2_Bilinear_Any_C, - ScaleRowUp2_Bilinear_C, - ScaleRowUp2_Bilinear_C, - 0, - uint8_t) - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, - ScaleRowUp2_Bilinear_16_C, - ScaleRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, - ScaleRowUp2_Bilinear_SSE2, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 - -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, - ScaleRowUp2_Bilinear_12_SSSE3, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, - ScaleRowUp2_Bilinear_16_SSE2, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, - ScaleRowUp2_Bilinear_SSSE3, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, - ScaleRowUp2_Bilinear_AVX2, - ScaleRowUp2_Bilinear_C, - 31, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, - ScaleRowUp2_Bilinear_12_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, - ScaleRowUp2_Bilinear_16_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#undef SU2BLANY - -// Scale bi-planar plane up horizontally 2 times using linear filter. 
-#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - dst_ptr[1] = src_ptr[1]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 2, n); \ - } \ - C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ - } \ - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ - } - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, - ScaleUVRowUp2_Linear_C, - ScaleUVRowUp2_Linear_C, - 0, - uint8_t) - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, - ScaleUVRowUp2_Linear_16_C, - ScaleUVRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, - ScaleUVRowUp2_Linear_SSSE3, - ScaleUVRowUp2_Linear_C, - 7, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, - ScaleUVRowUp2_Linear_AVX2, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, - ScaleUVRowUp2_Linear_16_SSE41, - ScaleUVRowUp2_Linear_16_C, - 3, - uint16_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, - ScaleUVRowUp2_Linear_16_AVX2, - ScaleUVRowUp2_Linear_16_C, - 7, - uint16_t) - -#endif - -#undef SBUH2LANY - -// Scale bi-planar plane up 2 times using bilinear filter. -// This function produces 2 rows at a time. -#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ - db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 2, db - da, n); \ - } \ - C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ - } \ - da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ - sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ - 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ - sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ - 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - } - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, - ScaleUVRowUp2_Bilinear_C, - ScaleUVRowUp2_Bilinear_C, - 0, - uint8_t) - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, - ScaleUVRowUp2_Bilinear_16_C, - ScaleUVRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, - ScaleUVRowUp2_Bilinear_SSSE3, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, - ScaleUVRowUp2_Bilinear_AVX2, - ScaleUVRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, - ScaleUVRowUp2_Bilinear_16_SSE41, - 
ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, - ScaleUVRowUp2_Bilinear_16_AVX2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#undef SBU2BLANY diff --git a/pkg/encoder/yuv/libyuv/scale_common.c b/pkg/encoder/yuv/libyuv/scale_common.c deleted file mode 100644 index 17eedd992..000000000 --- a/pkg/encoder/yuv/libyuv/scale_common.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale.h" - -#include - -#include "cpu_id.h" -#include "row.h" -#include "scale_row.h" - -#define STATIC_CAST(type, expr) (type)(expr) - -// TODO(fbarchard): make clamp255 preserve negative values. -static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -// TODO(fbarchard): change scale to bits -#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// CPU agnostic row functions -void ScaleRowDown2_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Linear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - dst_width -= 1; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst += 1; - s += 2; - t += 2; - } - dst[0] = (s[0] + t[0] + 1) >> 1; -} - -void ScaleRowDown4_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void 
ScaleRowDown4Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown34_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -// Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -// Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// v dst_ptr at here v stop at here -// X O X X O X X O X X O X X O X -// ^ src_ptr at here -void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 
0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// src_ptr at here -// X v X X X X X X X X X -// O O O O O -// X X X X X X X X X X -// ^ dst_ptr at here ^ stop at here -// X X X X X X X X X X -// O O O O O -// X X X X X X X X X X -void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - uint8_t *d = dst_ptr; - uint8_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Only suitable for at most 14 bit range. -void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Only suitable for at most 12bit range. -void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t *s = src_ptr; - const uint16_t *t = src_ptr + src_stride; - uint16_t *d = dst_ptr; - uint16_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -#else -// Intel uses 7 bit math with rounding. 
-#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) -#endif - -void ScaleFilterCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t) (x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -#undef BLENDER - -// Same as 8 bit arm blender but return is cast to uint16_t -#define BLENDER(a, b, f) \ - (uint16_t)( \ - (int)(a) + \ - (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) -#undef BLENDER - -void ScaleRowDown38_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width) { - int x; - 
assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -// UV scale row functions -// same as ARGB but 2 channels - -void ScaleUVRowDown2_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = src_uv[2]; // Store the 2nd UV - dst_uv[1] = src_uv[3]; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; - dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Box_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDownEven_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t *dst_uv, - int dst_width) { - const uint16_t *src = (const uint16_t *) (src_uv); - uint16_t *dst = (uint16_t *) (dst_uv); - (void) src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. 
-void ScaleColsUp2_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void) x; - (void) dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - uint8_t *d = dst_ptr; - uint8_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t *s = src_ptr; - const uint16_t *t = src_ptr + src_stride; - uint16_t *d = dst_ptr; - uint16_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 
* x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -void ScaleUVFilterCols_C(uint8_t *dst_uv, - const uint8_t *src_uv, - int dst_width, - int x, - int dx) { - const uint16_t *src = (const uint16_t *) (src_uv); - uint16_t *dst = (uint16_t *) (dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// Scale plane vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_argb, - uint8_t *dst_argb, - int x, - int y, - int dy, - int bpp, // bytes per pixel. 4 for ARGB. - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher bpp. - int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t *dst_argb, const uint8_t *src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - int j; - assert(bpp >= 1 && bpp <= 4); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - - - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_bytes, yf); - dst_argb += dst_stride; - y += dy; - } -} - -// Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (src_width < 0) { - src_width = -src_width; - } - if (src_height < 0) { - src_height = -src_height; - } - if (filtering == kFilterBox) { - // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. 
- if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { - filtering = kFilterBilinear; - } - } - if (filtering == kFilterBilinear) { - if (src_height == 1) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. - if (dst_height == src_height || dst_height * 3 == src_height) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to - // avoid reading 2 pixels horizontally that causes memory exception. - if (src_width == 1) { - filtering = kFilterNone; - } - } - if (filtering == kFilterLinear) { - if (src_width == 1) { - filtering = kFilterNone; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to None. - if (dst_width == src_width || dst_width * 3 == src_width) { - filtering = kFilterNone; - } - } - return filtering; -} - -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int *x, - int *y, - int *dx, - int *dy) { - assert(x != NULL); - assert(y != NULL); - assert(dx != NULL); - assert(dy != NULL); - assert(src_width != 0); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - // Check for 1 pixel and avoid FixedDiv overflow. - if (dst_width == 1 && src_width >= 32768) { - dst_width = src_width; - } - if (dst_height == 1 && src_height >= 32768) { - dst_height = src_height; - } - if (filtering == kFilterBox) { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = 0; - *y = 0; - } else if (filtering == kFilterBilinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_width > 1 && dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); - *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_height > 1 && dst_height > 1) { - *dy = FixedDiv1(src_height, dst_height); - *y = 0; - } - } else if (filtering == kFilterLinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_width > 1 && dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - *dy = FixedDiv(src_height, dst_height); - *y = *dy >> 1; - } else { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = CENTERSTART(*dx, 0); - *y = CENTERSTART(*dy, 0); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - *x += (dst_width - 1) * *dx; - *dx = -*dx; - // src_width = -src_width; // Caller must do this. - } -} - -#undef CENTERSTART diff --git a/pkg/encoder/yuv/libyuv/scale_gcc.c b/pkg/encoder/yuv/libyuv/scale_gcc.c deleted file mode 100644 index 716d6cfdb..000000000 --- a/pkg/encoder/yuv/libyuv/scale_gcc.c +++ /dev/null @@ -1,2651 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" -#include "scale_row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt - -void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown2_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile(LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", 
"xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#endif // HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown4_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "vpcmpeqb 
%%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (src_stride * 3)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 
\n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 
- "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, - 10, 11, 8, 9, 14, 15, 12, 13}; - -static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - -void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa 
%%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - -void ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - LABELALIGN - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa 
%%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 - -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 - -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 
2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 - -void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) 
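// Editorial sketch, not from the upstream libyuv sources being removed here.
// Every ScaleRowUp2_Linear_* row above (SSE2/SSSE3, 8/12/16-bit) evaluates the
// same per-sample filter: each output pair weights the "near" and "far" source
// samples 3:1, adds 2 for rounding, and shifts right by 2 (the "3*near+far+2"
// comments). A minimal scalar version of that arithmetic for the 8-bit case;
// the function name is illustrative, and it assumes the caller provides one
// extra readable source sample on the right (edge cases are handled by the
// *_Any_C wrappers in libyuv, not here).
#include <stdint.h>

static void scale_row_up2_linear_scalar(const uint8_t *src, uint8_t *dst,
                                        int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    // Left output leans on src[x], right output leans on src[x + 1].
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}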
- "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 - -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - 
"paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm5,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - -void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 
9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - -void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - -void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - 
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 - -void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) - "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) - - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 - - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 - "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 - - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far - "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,32(%1) \n" - - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" 
- : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 - -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) - - "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) - - "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) - "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1) \n" // store above - - "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) - "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) - "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 - -void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - 
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 - -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vpshufd 
$0b11011000,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -// Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEADDROW_AVX2 - -// Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. 
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - (void) x; - (void) dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. -int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ - defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - -// Shuffle table for splitting UV into upper and lower part of register. 
-static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; -static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, - 6u, 14u, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}; -#endif - -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 - -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 - -void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero - "vbroadcastf128 %4,%%ymm1 \n" // split shuffler - "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 - "lea 0x20(%0),%0 \n" - "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv - "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv - "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" // 8 UV - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 - -static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, - 3, 1, 3, 1, 1, 3, 1, 3}; - -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw 
%%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 - -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 
\n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 - -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" - "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu 
%%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 - -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 - -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 
6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 - -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 - -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd 
%%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale_row.h b/pkg/encoder/yuv/libyuv/scale_row.h deleted file mode 100644 index 16389cdcf..000000000 --- a/pkg/encoder/yuv/libyuv/scale_row.h +++ /dev/null @@ -1,768 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ -#define INCLUDE_LIBYUV_SCALE_ROW_H_ - -#include "basic_types.h" -#include "scale.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_FIXEDDIV1_X86 -#define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_SSE2 -#define HAS_SCALEROWUP2_LINEAR_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_16_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2_LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -#endif - -// The following are available for gcc/clang x86 platforms, but -// require clang 3.4 or gcc 4.7. -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_SCALEUVROWDOWN2BOX_AVX2 -#define HAS_SCALEROWUP2_LINEAR_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_AVX2 -#define HAS_SCALEROWUP2_LINEAR_12_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_12_AVX2 -#define HAS_SCALEROWUP2_LINEAR_16_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2_LINEAR_AVX2 -#define HAS_SCALEUVROWUP2_BILINEAR_AVX2 -#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_SCALEADDROW_AVX2 -#define HAS_SCALEROWDOWN2_AVX2 -#define HAS_SCALEROWDOWN4_AVX2 -#endif - -// Scale ARGB vertically with bilinear interpolation. 
-void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_argb, - uint8_t *dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering); - -// Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div); - -int FixedDiv1_X86(int num, int div); - -#ifdef HAS_FIXEDDIV_X86 -#define FixedDiv FixedDiv_X86 -#define FixedDiv1 FixedDiv1_X86 -#endif - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int *x, - int *y, - int *dx, - int *dy); - -void ScaleRowDown2_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Linear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown4_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown4Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown34_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width); - -void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width); - -void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleColsUp2_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int, - int); - -void ScaleFilterCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleFilterCols64_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x32, - int dx); - -void ScaleRowDown38_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int 
dst_width); - -void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleUVRowDown2_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDownEven_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -// Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void 
ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowDown2_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t *src_ptr, 
- ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Odd_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleAddRow_SSE2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleAddRow_AVX2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleAddRow_Any_SSE2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width); - -void ScaleAddRow_Any_AVX2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width); - -void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleColsUp2_SSE2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -// UV Row functions -void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowDown2Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int 
dst_width); - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/version.h b/pkg/encoder/yuv/libyuv/version.h deleted file mode 100644 index d45ef09d6..000000000 --- a/pkg/encoder/yuv/libyuv/version.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION_H_ - -#define LIBYUV_VERSION 1875 - -#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/pkg/encoder/yuv/libyuv/video_common.c b/pkg/encoder/yuv/libyuv/video_common.c deleted file mode 100644 index e492402e8..000000000 --- a/pkg/encoder/yuv/libyuv/video_common.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "video_common.h" - -struct FourCCAliasEntry { - uint32_t alias; - uint32_t canonical; -}; - -#define NUM_ALIASES 18 -static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW}, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 -}; - -LIBYUV_API -uint32_t CanonicalFourCC(uint32_t fourcc) { - int i; - for (i = 0; i < NUM_ALIASES; ++i) { - if (kFourCCAliases[i].alias == fourcc) { - return kFourCCAliases[i].canonical; - } - } - // Not an alias, so return it as-is. - return fourcc; -} diff --git a/pkg/encoder/yuv/libyuv/video_common.h b/pkg/encoder/yuv/libyuv/video_common.h deleted file mode 100644 index e2aacf44c..000000000 --- a/pkg/encoder/yuv/libyuv/video_common.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Common definitions for video, including fourcc and VideoFormat. - -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ -#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ - -#include "basic_types.h" - -////////////////////////////////////////////////////////////////////////////// -// Definition of FourCC codes -////////////////////////////////////////////////////////////////////////////// - -// Convert four characters to a FourCC code. -// Needs to be a macro otherwise the OS X compiler complains when the kFormat* -// constants are used in a switch. -#ifdef __cplusplus -#define FOURCC(a, b, c, d) \ - ((static_cast(a)) | (static_cast(b) << 8) | \ - (static_cast(c) << 16) | /* NOLINT */ \ - (static_cast(d) << 24)) /* NOLINT */ -#else -#define FOURCC(a, b, c, d) \ - (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ - ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ -#endif - -// Some pages discussing FourCC codes: -// http://www.fourcc.org/yuv.php -// http://v4l2spec.bytesex.org/spec/book1.htm -// http://developer.apple.com/quicktime/icefloe/dispatch020.html -// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 -// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt - -// FourCC codes grouped according to implementation efficiency. -// Primary formats should convert in 1 efficient step. -// Secondary formats are converted in 2 steps. -// Auxilliary formats call primary converters. -enum FourCC { - // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. 
- FOURCC_I420 = FOURCC('I', '4', '2', '0'), - FOURCC_I422 = FOURCC('I', '4', '2', '2'), - FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I400 = FOURCC('I', '4', '0', '0'), - FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), - FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), - FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), - FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 - FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 - - // 1 Secondary YUV format: row biplanar. deprecated. - FOURCC_M420 = FOURCC('M', '4', '2', '0'), - - // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp - FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), - FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), - FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), - FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. - FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit - FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. - FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit - FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), - FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), - FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. - FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. - FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - - // 1 Primary Compressed YUV format. - FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - - // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. - FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), - FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), - FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), - FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. - FOURCC_J420 = - FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = - FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = - FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = - FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc - FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc - FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc - FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc - FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc - FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc - FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc - FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc - FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc - FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 - FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 - FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 - FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 - FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 - FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 - FOURCC_P010 = FOURCC('P', '0', '1', '0'), - FOURCC_P210 = FOURCC('P', '2', '1', '0'), - - // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. - FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. - FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. - FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. 
- FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. - FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. - FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. - FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. - FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. - FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. - FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. - FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. - FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. - FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB - FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB - FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. - FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. - FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - - // deprecated formats. Not supported, but defined for backward compatibility. - FOURCC_I411 = FOURCC('I', '4', '1', '1'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - FOURCC_H264 = FOURCC('H', '2', '6', '4'), - - // Match any fourcc. - FOURCC_ANY = -1, -}; - -enum FourCCBpp { - // Canonical fourcc codes used in our code. - FOURCC_BPP_I420 = 12, - FOURCC_BPP_I422 = 16, - FOURCC_BPP_I444 = 24, - FOURCC_BPP_I411 = 12, - FOURCC_BPP_I400 = 8, - FOURCC_BPP_NV21 = 12, - FOURCC_BPP_NV12 = 12, - FOURCC_BPP_YUY2 = 16, - FOURCC_BPP_UYVY = 16, - FOURCC_BPP_M420 = 12, // deprecated - FOURCC_BPP_Q420 = 12, - FOURCC_BPP_ARGB = 32, - FOURCC_BPP_BGRA = 32, - FOURCC_BPP_ABGR = 32, - FOURCC_BPP_RGBA = 32, - FOURCC_BPP_AR30 = 32, - FOURCC_BPP_AB30 = 32, - FOURCC_BPP_AR64 = 64, - FOURCC_BPP_AB64 = 64, - FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, - FOURCC_BPP_RGBP = 16, - FOURCC_BPP_RGBO = 16, - FOURCC_BPP_R444 = 16, - FOURCC_BPP_RGGB = 8, - FOURCC_BPP_BGGR = 8, - FOURCC_BPP_GRBG = 8, - FOURCC_BPP_GBRG = 8, - FOURCC_BPP_YV12 = 12, - FOURCC_BPP_YV16 = 16, - FOURCC_BPP_YV24 = 24, - FOURCC_BPP_YU12 = 12, - FOURCC_BPP_J420 = 12, - FOURCC_BPP_J400 = 8, - FOURCC_BPP_H420 = 12, - FOURCC_BPP_H422 = 16, - FOURCC_BPP_I010 = 15, - FOURCC_BPP_I210 = 20, - FOURCC_BPP_H010 = 15, - FOURCC_BPP_H210 = 20, - FOURCC_BPP_P010 = 15, - FOURCC_BPP_P210 = 20, - FOURCC_BPP_MJPG = 0, // 0 means unknown. - FOURCC_BPP_H264 = 0, - FOURCC_BPP_IYUV = 12, - FOURCC_BPP_YU16 = 16, - FOURCC_BPP_YU24 = 24, - FOURCC_BPP_YUYV = 16, - FOURCC_BPP_YUVS = 16, - FOURCC_BPP_HDYC = 16, - FOURCC_BPP_2VUY = 16, - FOURCC_BPP_JPEG = 1, - FOURCC_BPP_DMB1 = 1, - FOURCC_BPP_BA81 = 8, - FOURCC_BPP_RGB3 = 24, - FOURCC_BPP_BGR3 = 24, - FOURCC_BPP_CM32 = 32, - FOURCC_BPP_CM24 = 24, - - // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. -}; - -// Converts fourcc aliases into canonical ones. 
-LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); - -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/yuv_test.go b/pkg/encoder/yuv/yuv_test.go index 6b67c29f0..3f07aa69d 100644 --- a/pkg/encoder/yuv/yuv_test.go +++ b/pkg/encoder/yuv/yuv_test.go @@ -115,6 +115,9 @@ func TestYuvPredefined(t *testing.T) { frame := RawFrame{Data: im, Stride: 32, W: 32, H: 32} a := pc.Process(frame, 0, PixFmt(libyuv.FourccAbgr)) + v := libyuv.Version() + t.Logf("%v", v) + if len(a) != len(should) { t.Fatalf("diffrent size a: %v, o: %v", len(a), len(should)) }
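A side note on the length comparison in the test hunk above: assuming the converter still emits planar 4:2:0 output (12 bits per pixel, matching FOURCC_BPP_I420 in the removed video_common.h), the expected buffer length for a WxH input frame is W*H*3/2 bytes. The short Go sketch below only illustrates that arithmetic; the i420Size helper is hypothetical and not part of the package.

package main

import "fmt"

// i420Size returns the byte length of an I420 (YUV 4:2:0) frame:
// one full-resolution Y plane plus two chroma planes subsampled 2x2.
// The helper name is illustrative only; it does not exist in the repository.
func i420Size(w, h int) int {
	y := w * h                         // luma: 1 byte per pixel
	c := ((w + 1) / 2) * ((h + 1) / 2) // each chroma plane after 2x2 subsampling
	return y + 2*c
}

func main() {
	// The updated test converts a 32x32 ABGR frame, so a check such as
	// len(a) != len(should) would be comparing against 1536 bytes here.
	fmt.Println(i420Size(32, 32)) // 1536 = 32*32*3/2
}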