diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ccae921f1..4da180c58 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,7 +38,7 @@ jobs: - name: Get MacOS dev libraries and tools if: matrix.os == 'macos-latest' run: | - brew install pkg-config libvpx x264 opus sdl2 + brew install pkg-config libvpx x264 opus sdl2 jpeg-turbo - name: Get Windows dev libraries and tools if: matrix.os == 'windows-latest' diff --git a/Makefile b/Makefile index f0afe6ad7..748aa595f 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ test: go test -v ./pkg/... verify-cores: - go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "../../../_rendered" + go test -run TestAll ./pkg/worker/room -v -renderFrames $(GL_CTX) -outputPath "./_rendered" dev.build: compile build diff --git a/README.md b/README.md index b3f181c31..d1d837ad7 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ a better sense of performance. apt-get install -y make gcc pkg-config libvpx-dev libx264-dev libopus-dev libsdl2-dev libyuv-dev # MacOS -brew install pkg-config libvpx x264 opus sdl2 +brew install pkg-config libvpx x264 opus sdl2 jpeg-turbo # Windows (MSYS2) pacman -Sy --noconfirm --needed git make mingw-w64-x86_64-{gcc,pkgconf,dlfcn,libvpx,opus,x264-git,SDL2,libyuv} diff --git a/pkg/encoder/yuv/libyuv/LICENSE b/pkg/encoder/yuv/libyuv/LICENSE deleted file mode 100644 index c911747a6..000000000 --- a/pkg/encoder/yuv/libyuv/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -Copyright 2011 The LibYuv Project Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name of Google nor the names of its contributors may - be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/encoder/yuv/libyuv/basic_types.h b/pkg/encoder/yuv/libyuv/basic_types.h deleted file mode 100644 index 9c66a132a..000000000 --- a/pkg/encoder/yuv/libyuv/basic_types.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ -#define INCLUDE_LIBYUV_BASIC_TYPES_H_ - -#include // For size_t and NULL - -#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) -#define INT_TYPES_DEFINED - -#include // for uintptr_t and C99 types - -#endif // INT_TYPES_DEFINED - -#if !defined(LIBYUV_API) -#define LIBYUV_API -#endif // LIBYUV_API - -#define LIBYUV_BOOL int - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/pkg/encoder/yuv/libyuv/convert.c b/pkg/encoder/yuv/libyuv/convert.c deleted file mode 100644 index c59da3b1b..000000000 --- a/pkg/encoder/yuv/libyuv/convert.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "convert.h" - -#include "basic_types.h" -#include "cpu_id.h" -#include "planar_functions.h" -#include "row.h" - -// Subsample amount uses a shift. -// v is value -// a is amount to add to round up -// s is shift to subsample down -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// Copy I420 with optional flipping. -// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure -// is does row coalescing. -LIBYUV_API -int I420Copy(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || - height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - // Copy UV planes. - CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); - CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); - return 0; -} - -// Convert ARGB to I420. -LIBYUV_API -int ARGBToI420(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); - src_argb += src_stride_argb * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - } - return 0; -} - -// Convert ABGR to I420. -LIBYUV_API -int ABGRToI420(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*ABGRToUVRow)(const uint8_t *src_abgr0, int src_stride_abgr, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8_t *src_abgr, uint8_t *dst_y, int width) = - ABGRToYRow_C; - if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } -#if defined(HAS_ABGRTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ABGRTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToYRow = ABGRToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToYRow = ABGRToYRow_AVX2; - } - } -#endif -#if defined(HAS_ABGRTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ABGRToUVRow = ABGRToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ABGRToUVRow = ABGRToUVRow_AVX2; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); - src_abgr += src_stride_abgr * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); - ABGRToYRow(src_abgr, dst_y, width); - } - return 0; -} - -// Convert RGB565 to I420. 
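The ARGB and ABGR converters above walk the source two rows at a time because the U and V planes are subsampled 2x2, and chroma dimensions round up with the same (v + 1) >> 1 convention used by SUBSAMPLE and halfwidth/halfheight. A minimal Go sketch (illustrative only, not part of this change) of how an I420 destination buffer for these functions would be sized:

package main

import "fmt"

// i420Size returns the Y-plane and per-chroma-plane sizes for a w x h frame,
// rounding chroma up the same way I420Copy computes halfwidth/halfheight.
func i420Size(w, h int) (ySize, uvSize int) {
	cw, ch := (w+1)/2, (h+1)/2 // chroma planes are subsampled 2x2, rounded up
	return w * h, cw * ch
}

func main() {
	y, uv := i420Size(641, 481)   // odd sizes still get full chroma coverage
	fmt.Println(y + 2*uv)         // total bytes for one I420 frame (Y + U + V)
}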
-LIBYUV_API -int RGB565ToI420(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height) { - int y; - void (*RGB565ToARGBRow)(const uint8_t *src_rgb, uint8_t *dst_argb, - int width) = RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t *src_argb0, int src_stride_argb, - uint8_t *dst_u, uint8_t *dst_v, int width) = - ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8_t *src_argb, uint8_t *dst_y, int width) = - ARGBToYRow_C; - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; - src_stride_rgb565 = -src_stride_rgb565; - } - -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif -#if defined(HAS_RGB565TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - RGB565ToARGBRow = RGB565ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVRow = ARGBToUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToUVRow = ARGBToUVRow_AVX2; - } - } -#endif - { -#if !(defined(HAS_RGB565TOYROW_NEON)) - // Allocate 2 rows of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size * 2); -#endif - for (y = 0; y < height - 1; y += 2) { -#if (defined(HAS_RGB565TOYROW_NEON)) -#else - RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); - ARGBToUVRow(row, row_size, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); -#endif - src_rgb565 += src_stride_rgb565 * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { -#if (defined(HAS_RGB565TOYROW_NEON)) -#else - RGB565ToARGBRow(src_rgb565, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); -#endif - } -#if !(defined(HAS_RGB565TOYROW_NEON)) - free_aligned_buffer_64(row); -#endif - } - return 0; -} diff --git a/pkg/encoder/yuv/libyuv/convert.h b/pkg/encoder/yuv/libyuv/convert.h deleted file mode 100644 index 9a81c509c..000000000 --- a/pkg/encoder/yuv/libyuv/convert.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_H_ -#define INCLUDE_LIBYUV_CONVERT_H_ - -#include "rotate.h" // For enum RotationMode. - -// Copy I420 to I420. -#define I420ToI420 I420Copy -LIBYUV_API -int I420Copy(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// ARGB little endian (bgra in memory) to I420. -LIBYUV_API -int ARGBToI420(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// ABGR little endian (rgba in memory) to I420. -LIBYUV_API -int ABGRToI420(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// RGB16 (RGBP fourcc) little endian to I420. -LIBYUV_API -int RGB565ToI420(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height); - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. -// "dst_stride_y" number of bytes in a row of the dst_y plane. -// Normally this would be the same as dst_width, with recommended alignment -// to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. The caller should -// allocate the I420 buffer according to rotation. -// "dst_stride_u" number of bytes in a row of the dst_u plane. -// Normally this would be the same as (dst_width + 1) / 2, with -// recommended alignment to 16 bytes for better efficiency. -// If rotation of 90 or 270 is used, stride is affected. -// "crop_x" and "crop_y" are starting position for cropping. -// To center, crop_x = (src_width - dst_width) / 2 -// crop_y = (src_height - dst_height) / 2 -// "src_width" / "src_height" is size of src_frame in pixels. -// "src_height" can be negative indicating a vertically flipped image source. -// "crop_width" / "crop_height" is the size to crop the src to. -// Must be less than or equal to src_width/src_height -// Cropping parameters are pre-rotation. -// "rotation" can be 0, 90, 180 or 270. -// "fourcc" is a fourcc. ie 'I420', 'YUY2' -// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. -LIBYUV_API -int ConvertToI420(const uint8_t *sample, - size_t sample_size, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc); - -#endif // INCLUDE_LIBYUV_CONVERT_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/convert_argb.h b/pkg/encoder/yuv/libyuv/convert_argb.h deleted file mode 100644 index ac8e97169..000000000 --- a/pkg/encoder/yuv/libyuv/convert_argb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ -#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ - -#include "basic_types.h" - -// Conversion matrix for YVU to BGR -LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 -LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full -LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 -LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full -LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 -LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full - -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/pkg/encoder/yuv/libyuv/convert_to_i420.c b/pkg/encoder/yuv/libyuv/convert_to_i420.c deleted file mode 100644 index 848021427..000000000 --- a/pkg/encoder/yuv/libyuv/convert_to_i420.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "convert.h" -#include "video_common.h" - -// Convert camera sample to I420 with cropping, rotation and vertical flip. -// src_width is used for source stride computation -// src_height is used to compute location of planes, and indicate inversion -// sample_size is measured in bytes and is the size of the frame. -// With MJPEG it is the compressed size of the frame. -LIBYUV_API -int ConvertToI420(const uint8_t *sample, - size_t sample_size, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int crop_x, - int crop_y, - int src_width, - int src_height, - int crop_width, - int crop_height, - enum RotationMode rotation, - uint32_t fourcc) { - uint32_t format = CanonicalFourCC(fourcc); - const uint8_t *src; - // TODO(nisse): Why allow crop_height < 0? - const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - int r = 0; - LIBYUV_BOOL need_buf = - (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && - format != FOURCC_NV21 && format != FOURCC_YV12) || - dst_y == sample; - uint8_t *tmp_y = dst_y; - uint8_t *tmp_u = dst_u; - uint8_t *tmp_v = dst_v; - int tmp_y_stride = dst_stride_y; - int tmp_u_stride = dst_stride_u; - int tmp_v_stride = dst_stride_v; - uint8_t *rotate_buffer = NULL; - const int inv_crop_height = - (src_height < 0) ? -abs_crop_height : abs_crop_height; - - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { - return -1; - } - - // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination dst_y is same as source sample, - // also enable temporary buffer. 
- if (need_buf) { - int y_size = crop_width * abs_crop_height; - int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8_t *) malloc(y_size + uv_size * 2); /* NOLINT */ - if (!rotate_buffer) { - return 1; // Out of memory runtime error. - } - dst_y = rotate_buffer; - dst_u = dst_y + y_size; - dst_v = dst_u + uv_size; - dst_stride_y = crop_width; - dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); - } - - switch (format) { - // Single plane formats - case FOURCC_RGBP: - src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - case FOURCC_ABGR: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); - break; - default: - r = -1; // unknown fourcc - return failure code. - } - - if (need_buf) { - if (!r) { - r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, - dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, crop_width, abs_crop_height, - rotation); - } - free(rotate_buffer); - } - - return r; -} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.c b/pkg/encoder/yuv/libyuv/cpu_id.c deleted file mode 100644 index 166057de5..000000000 --- a/pkg/encoder/yuv/libyuv/cpu_id.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "cpu_id.h" - -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ - defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) -#include // For _xgetbv() -#endif - -// For ArmCpuCaps() but unittested on all platforms -#include // For fopen() -#include - -// For functions that use the stack and have runtime checks for overflow, -// use SAFEBUFFERS to avoid additional check. -#define SAFEBUFFERS - -// cpu_info_ variable for SIMD instruction sets detected. -LIBYUV_API int cpu_info_ = 0; - -// Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int *cpu_info) { -#if defined(_MSC_VER) - // GCC version uses inline x86 assembly. -#else // defined(_MSC_VER) - int info_ebx, info_edx; - asm volatile( -#if defined(__i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D"(info_ebx), -#else - "cpuid \n" - : "=b"(info_ebx), -#endif // defined( __i386__) && defined(__PIC__) - "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); - cpu_info[0] = info_eax; - cpu_info[1] = info_ebx; - cpu_info[2] = info_ecx; - cpu_info[3] = info_edx; -#endif // defined(_MSC_VER) -} - -#else // (defined(_M_IX86) || defined(_M_X64) ... 
-LIBYUV_API -void CpuId(int eax, int ecx, int* cpu_info) { - (void)eax; - (void)ecx; - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} -#endif - -// For VS2010 and earlier emit can be used: -// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. -// __asm { -// xor ecx, ecx // xcr 0 -// xgetbv -// mov xcr0, eax -// } -// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. -// https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) -#pragma optimize("g", off) -#endif -#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__)) && \ - !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) - -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -static int GetXCR0() { - int xcr0 = 0; -#if defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); -#endif // defined(__i386__) || defined(__x86_64__) - return xcr0; -} - -#else -// xgetbv unavailable to query for OSSave support. Return 0. -#define GetXCR0() 0 -#endif // defined(_M_IX86) || defined(_M_X64) .. -// Return optimization to previous setting. -#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) -#pragma optimize("g", on) -#endif - -// Based on libvpx arm_cpudetect.c -// For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char *cpuinfo_name) { - char cpuinfo_line[512]; - FILE *f = fopen(cpuinfo_name, "re"); - if (!f) { - // Assume Neon if /proc/cpuinfo is unavailable. - // This will occur for Chrome sandbox for Pepper or Render process. - return kCpuHasNEON; - } - memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); - while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { - if (memcmp(cpuinfo_line, "Features", 8) == 0) { - char *p = strstr(cpuinfo_line, " neon"); - if (p && (p[5] == ' ' || p[5] == '\n')) { - fclose(f); - return kCpuHasNEON; - } - // aarch64 uses asimd for Neon. - p = strstr(cpuinfo_line, " asimd"); - if (p) { - fclose(f); - return kCpuHasNEON; - } - } - } - fclose(f); - return 0; -} - -static SAFEBUFFERS int GetCpuFlags(void) { - int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86)) - int cpu_info0[4] = {0, 0, 0, 0}; - int cpu_info1[4] = {0, 0, 0, 0}; - int cpu_info7[4] = {0, 0, 0, 0}; - CpuId(0, 0, cpu_info0); - CpuId(1, 0, cpu_info1); - if (cpu_info0[0] >= 7) { - CpuId(7, 0, cpu_info7); - } - cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | - ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | - ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | - ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); - - // AVX requires OS saves YMM registers. - if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave - ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); - - // Detect AVX512bw - if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; - cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; - cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; - cpu_info |= (cpu_info7[2] & 0x00000040) ? 
kCpuHasAVX512VBMI2 : 0; - cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; - cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; - cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; - } - } -#endif -#if defined(__arm__) || defined(__aarch64__) - // gcc -mfpu=neon defines __ARM_NEON__ - // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. - // For Linux, /proc/cpuinfo can be tested but without that assume Neon. -#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) - cpu_info = kCpuHasNEON; - // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon - // flag in it. - // So for aarch64, neon enabling is hard coded here. -#endif -#if defined(__aarch64__) - cpu_info = kCpuHasNEON; -#else - // Linux arm parse text file for neon detect. - cpu_info = ArmCpuCaps("/proc/cpuinfo"); -#endif - cpu_info |= kCpuHasARM; -#endif // __arm__ - cpu_info |= kCpuInitialized; - return cpu_info; -} - -// Note that use of this function is not thread safe. -LIBYUV_API -int MaskCpuFlags(int enable_flags) { - int cpu_info = GetCpuFlags() & enable_flags; - SetCpuFlags(cpu_info); - return cpu_info; -} - -LIBYUV_API -int InitCpuFlags(void) { - return MaskCpuFlags(-1); -} diff --git a/pkg/encoder/yuv/libyuv/cpu_id.h b/pkg/encoder/yuv/libyuv/cpu_id.h deleted file mode 100644 index bf50b9cd1..000000000 --- a/pkg/encoder/yuv/libyuv/cpu_id.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ -#define INCLUDE_LIBYUV_CPU_ID_H_ - -#include "basic_types.h" - -// Internal flag to indicate cpuid requires initialization. -static const int kCpuInitialized = 0x1; - -// These flags are only valid on ARM processors. -static const int kCpuHasARM = 0x2; -static const int kCpuHasNEON = 0x4; -// 0x8 reserved for future ARM flag. - -// These flags are only valid on x86 processors. -static const int kCpuHasX86 = 0x10; -static const int kCpuHasSSE2 = 0x20; -static const int kCpuHasSSSE3 = 0x40; -static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; // unused at this time. -static const int kCpuHasAVX = 0x200; -static const int kCpuHasAVX2 = 0x400; -static const int kCpuHasERMS = 0x800; -static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasF16C = 0x2000; -static const int kCpuHasGFNI = 0x4000; -static const int kCpuHasAVX512BW = 0x8000; -static const int kCpuHasAVX512VL = 0x10000; -static const int kCpuHasAVX512VNNI = 0x20000; -static const int kCpuHasAVX512VBMI = 0x40000; -static const int kCpuHasAVX512VBMI2 = 0x80000; -static const int kCpuHasAVX512VBITALG = 0x100000; -static const int kCpuHasAVX512VPOPCNTDQ = 0x200000; - -// Optional init function. TestCpuFlag does an auto-init. -// Returns cpu_info flags. -LIBYUV_API -int InitCpuFlags(void); - -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. 
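Throughout the deleted convert.c and rotate.c, each row kernel defaults to the portable *_C implementation and is upgraded to an SSSE3/AVX2 variant only when TestCpuFlag reports the feature and the width meets the kernel's alignment. A rough Go analogue of that dispatch pattern (a sketch only; it assumes the kCpuHasAVX2 value from cpu_id.h above and is not code from this repository):

package main

// rowFn mirrors the shape of the C row functions (e.g. ARGBToYRow_*).
type rowFn func(src, dst []byte, width int)

// kCpuHasAVX2 copies the flag value declared in cpu_id.h above.
const kCpuHasAVX2 = 0x400

// pickRowFn starts from the portable implementation and only swaps in the
// SIMD variant when the CPU flag is set and the width meets the alignment
// that kernel expects (32 pixels for the AVX2 rows in convert.c).
func pickRowFn(cpuInfo, width int, generic, avx2 rowFn) rowFn {
	if cpuInfo&kCpuHasAVX2 != 0 && width%32 == 0 {
		return avx2
	}
	return generic
}

func main() {
	generic := func(src, dst []byte, w int) { copy(dst[:w], src[:w]) }
	avx2 := generic // stand-in; a real build would point at the SIMD kernel
	fn := pickRowFn(kCpuHasAVX2, 1280, generic, avx2)
	fn(make([]byte, 1280), make([]byte, 1280), 1280)
}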
-// Returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); -#else - int cpu_info = cpu_info_; -#endif - return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; -} - -// Internal function for parsing /proc/cpuinfo. -LIBYUV_API -int ArmCpuCaps(const char *cpuinfo_name); - -// For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -// MaskCpuFlags(-1) to enable all cpu specific optimizations. -// MaskCpuFlags(1) to disable all cpu specific optimizations. -// MaskCpuFlags(0) to reset state so next call will auto init. -// Returns cpu_info flags. -LIBYUV_API -int MaskCpuFlags(int enable_flags); - -// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| -// should be a valid combination of the kCpuHas constants above and include -// kCpuInitialized. Use this method when running in a sandboxed process where -// the detection code might fail (as it might access /proc/cpuinfo). In such -// cases the cpu_info can be obtained from a non sandboxed process by calling -// InitCpuFlags() and passed to the sandboxed process (via command line -// parameters, IPC...) which can then call this method to initialize the CPU -// flags. -// Notes: -// - when specifying 0 for |cpu_flags|, the auto initialization is enabled -// again. -// - enabling CPU features that are not supported by the CPU will result in -// undefined behavior. -// TODO(fbarchard): consider writing a helper function that translates from -// other library CPU info to libyuv CPU info and add a .md doc that explains -// CPU detection. -static __inline void SetCpuFlags(int cpu_flags) { - LIBYUV_API extern int cpu_info_; -#ifdef __ATOMIC_RELAXED - __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); -#else - cpu_info_ = cpu_flags; -#endif -} - -// Low level cpuid for X86. Returns zeros on other CPUs. -// eax is the info type that you want. -// ecx is typically the cpu number, and should normally be zero. -LIBYUV_API -void CpuId(int info_eax, int info_ecx, int *cpu_info); - -#endif // INCLUDE_LIBYUV_CPU_ID_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/libyuv.go b/pkg/encoder/yuv/libyuv/libyuv.go index 98d4276ff..8bde0ad89 100644 --- a/pkg/encoder/yuv/libyuv/libyuv.go +++ b/pkg/encoder/yuv/libyuv/libyuv.go @@ -1,18 +1,43 @@ -//go:build !darwin && !no_libyuv - +// Package libyuv contains the wrapper for: https://chromium.googlesource.com/libyuv/libyuv. +// Libs are downloaded from: https://packages.macports.org/libyuv/. 
package libyuv -// see: https://chromium.googlesource.com/libyuv/libyuv - /* -#cgo CFLAGS: -Wall -#cgo LDFLAGS: -lyuv +#cgo !darwin LDFLAGS: -lyuv + +#cgo darwin CFLAGS: -DINCLUDE_LIBYUV_VERSION_H_ +#cgo darwin LDFLAGS: -L${SRCDIR} -lstdc++ +#cgo darwin,amd64 LDFLAGS: -lyuv_darwin_x86_64 -ljpeg -lstdc++ +#cgo darwin,arm64 LDFLAGS: -lyuv_darwin_arm64 -ljpeg -lstdc++ -#include +#include // for uintptr_t and C99 types + +#if !defined(LIBYUV_API) +#define LIBYUV_API +#endif // LIBYUV_API + +#ifndef INCLUDE_LIBYUV_VERSION_H_ #include "libyuv/version.h" -#include "libyuv/video_common.h" +#else +#define LIBYUV_VERSION 1874 // darwin static libs version +#endif // INCLUDE_LIBYUV_VERSION_H_ + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) + +enum FourCC { + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_ANY = -1, +}; -// typedef enum RotationMode { kRotate0 = 0, // No rotation. kRotate90 = 90, // Rotate 90 degrees clockwise. @@ -20,7 +45,6 @@ typedef enum RotationMode { kRotate270 = 270, // Rotate 270 degrees clockwise. } RotationModeEnum; -// LIBYUV_API int ConvertToI420(const uint8_t* sample, size_t sample_size, @@ -65,6 +89,11 @@ int I420Scale(const uint8_t *src_y, int dst_width, int dst_height, enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif */ import "C" import "fmt" diff --git a/pkg/encoder/yuv/libyuv/libyuv2.go b/pkg/encoder/yuv/libyuv/libyuv2.go deleted file mode 100644 index f4f6a68b5..000000000 --- a/pkg/encoder/yuv/libyuv/libyuv2.go +++ /dev/null @@ -1,89 +0,0 @@ -//go:build darwin || no_libyuv - -package libyuv - -/* -#cgo CFLAGS: -Wall - -#include "basic_types.h" -#include "version.h" -#include "video_common.h" -#include "rotate.h" -#include "scale.h" -#include "convert.h" - -*/ -import "C" -import "fmt" - -const FourccRgbp uint32 = C.FOURCC_RGBP -const FourccArgb uint32 = C.FOURCC_ARGB -const FourccAbgr uint32 = C.FOURCC_ABGR - -func Y420(src []byte, dst []byte, _, h, stride int, dw, dh int, rot uint, pix uint32, cx, cy int) { - cw := (dw + 1) / 2 - ch := (dh + 1) / 2 - i0 := dw * dh - i1 := i0 + cw*ch - yStride := dw - cStride := cw - - C.ConvertToI420( - (*C.uchar)(&src[0]), - C.size_t(0), - (*C.uchar)(&dst[0]), - C.int(yStride), - (*C.uchar)(&dst[i0]), - C.int(cStride), - (*C.uchar)(&dst[i1]), - C.int(cStride), - C.int(0), - C.int(0), - C.int(stride), - C.int(h), - C.int(cx), - C.int(cy), - C.enum_RotationMode(rot), - C.uint32_t(pix)) -} - -func Y420Scale(src []byte, dst []byte, w, h int, dw, dh int) { - srcWidthUV, dstWidthUV := (w+1)>>1, (dw+1)>>1 - srcHeightUV, dstHeightUV := (h+1)>>1, (dh+1)>>1 - - srcYPlaneSize, dstYPlaneSize := w*h, dw*dh - srcUVPlaneSize, dstUVPlaneSize := srcWidthUV*srcHeightUV, dstWidthUV*dstHeightUV - - srcStrideY, dstStrideY := w, dw - srcStrideU, dstStrideU := srcWidthUV, dstWidthUV - srcStrideV, dstStrideV := srcWidthUV, dstWidthUV - - srcY := (*C.uchar)(&src[0]) - srcU := (*C.uchar)(&src[srcYPlaneSize]) - srcV := (*C.uchar)(&src[srcYPlaneSize+srcUVPlaneSize]) - - dstY := (*C.uchar)(&dst[0]) - dstU := (*C.uchar)(&dst[dstYPlaneSize]) - dstV := (*C.uchar)(&dst[dstYPlaneSize+dstUVPlaneSize]) - - C.I420Scale( - srcY, - C.int(srcStrideY), - srcU, - C.int(srcStrideU), - srcV, - C.int(srcStrideV), - C.int(w), - C.int(h), - 
dstY, - C.int(dstStrideY), - dstU, - C.int(dstStrideU), - dstV, - C.int(dstStrideV), - C.int(dw), - C.int(dh), - C.enum_FilterMode(C.kFilterNone)) -} - -func Version() string { return fmt.Sprintf("%v mod", int(C.LIBYUV_VERSION)) } diff --git a/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a b/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a new file mode 100644 index 000000000..f399a41c7 Binary files /dev/null and b/pkg/encoder/yuv/libyuv/libyuv_darwin_arm64.a differ diff --git a/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a b/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a new file mode 100644 index 000000000..63cd5c74a Binary files /dev/null and b/pkg/encoder/yuv/libyuv/libyuv_darwin_x86_64.a differ diff --git a/pkg/encoder/yuv/libyuv/planar_functions.c b/pkg/encoder/yuv/libyuv/planar_functions.c deleted file mode 100644 index a5d543cc5..000000000 --- a/pkg/encoder/yuv/libyuv/planar_functions.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "planar_functions.h" - -#include "cpu_id.h" -#include "row.h" - -// Copy a plane of data -LIBYUV_API -void CopyPlane(const uint8_t *src_y, - int src_stride_y, - uint8_t *dst_y, - int dst_stride_y, - int width, - int height) { - int y; - void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; - if (width <= 0 || height == 0) { - return; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_stride_y = -dst_stride_y; - } - // Coalesce rows. - if (src_stride_y == width && dst_stride_y == width) { - width *= height; - height = 1; - src_stride_y = dst_stride_y = 0; - } - // Nothing to do. - if (src_y == dst_y && src_stride_y == dst_stride_y) { - return; - } - -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif - - // Copy plane - for (y = 0; y < height; ++y) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - } -} diff --git a/pkg/encoder/yuv/libyuv/planar_functions.h b/pkg/encoder/yuv/libyuv/planar_functions.h deleted file mode 100644 index 222109cfc..000000000 --- a/pkg/encoder/yuv/libyuv/planar_functions.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
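The deleted libyuv2.go above was the darwin-only twin of libyuv.go; assuming the kept file exposes the same exported Y420, Y420Scale, and FourccAbgr (only the cgo plumbing changes in this diff), a hypothetical caller inside the package could convert and downscale a frame like this:

package libyuv

// exampleConvertAndScale is an illustrative sketch, not part of this change.
// It converts a 640x480 RGBA-in-memory frame (FourccAbgr, per convert.h:
// "ABGR little endian (rgba in memory)") to I420, then downscales to 320x240.
func exampleConvertAndScale(frame []byte) []byte {
	const w, h, dw, dh = 640, 480, 320, 240
	i420 := make([]byte, w*h+2*((w+1)/2)*((h+1)/2))
	// no rotation, crop covers the whole frame, source stride given in pixels
	Y420(frame, i420, w, h, w, w, h, 0, FourccAbgr, w, h)
	out := make([]byte, dw*dh+2*((dw+1)/2)*((dh+1)/2))
	Y420Scale(i420, out, w, h, dw, dh)
	return out
}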
- */ - -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ - -#include "basic_types.h" - -// TODO(fbarchard): Move cpu macros to row.h -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_ARGBAFFINEROW_SSE2 -#endif - -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8_t *src_y, - int src_stride_y, - uint8_t *dst_y, - int dst_stride_y, - int width, - int height); - -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/rotate.c b/pkg/encoder/yuv/libyuv/rotate.c deleted file mode 100644 index 4aabae5b0..000000000 --- a/pkg/encoder/yuv/libyuv/rotate.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate.h" - -#include "convert.h" -#include "cpu_id.h" -#include "rotate_row.h" -#include "row.h" - -LIBYUV_API -void TransposePlane(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - int i = height; - - void (*TransposeWx8)(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int width) = TransposeWx8_C; - -#if defined(HAS_TRANSPOSEWX8_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; - } - } -#endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; - } - } -#endif - - // Work across the source in 8x8 tiles - while (i >= 8) { - TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. - i -= 8; - } - - if (i > 0) { - TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); - } -} - -LIBYUV_API -void RotatePlane90(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Rotate by 90 is a transpose with the source read - // from bottom to top. So set the source pointer to the end - // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane270(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Rotate by 270 is a transpose with the destination written - // from bottom to top. 
So set the destination pointer to the end - // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - TransposePlane(src, src_stride, dst, dst_stride, width, height); -} - -LIBYUV_API -void RotatePlane180(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - // Swap top and bottom row and mirror the content. Uses a temporary row. - align_buffer_64(row, width); - const uint8_t *src_bot = src + src_stride * (height - 1); - uint8_t *dst_bot = dst + dst_stride * (height - 1); - int half_height = (height + 1) >> 1; - int y; - void (*MirrorRow)(const uint8_t *src, uint8_t *dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8_t *src, uint8_t *dst, int width) = CopyRow_C; -#if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; - } - } -#endif -#if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MirrorRow = MirrorRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; - } - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; - } -#endif -#if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX)) { - CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) -#endif - // Odd height will harmlessly mirror the middle row twice. - for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy top row into buffer - MirrorRow(src_bot, dst, width); // Mirror bottom row into top row - MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; - } - free_aligned_buffer_64(row); -} - -LIBYUV_API -int I420Rotate(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode) { - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { - return -1; - } - - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - switch (mode) { - case kRotate0: - // copy frame - return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, - src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, - dst_v, dst_stride_v, width, height); - case kRotate90: - RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate270: - RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - case kRotate180: - RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, - halfheight); - RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, - halfheight); - return 0; - default: - break; - } - return -1; -} diff --git a/pkg/encoder/yuv/libyuv/rotate.h b/pkg/encoder/yuv/libyuv/rotate.h deleted file mode 100644 index 59b9ec3cb..000000000 --- a/pkg/encoder/yuv/libyuv/rotate.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROTATE_H_ -#define INCLUDE_LIBYUV_ROTATE_H_ - -#include "basic_types.h" - -// Supported rotation. -typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. - kRotate180 = 180, // Rotate 180 degrees. - kRotate270 = 270, // Rotate 270 degrees clockwise. -} RotationModeEnum; - -// Rotate I420 frame. -LIBYUV_API -int I420Rotate(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int width, - int height, - enum RotationMode mode); - -// Rotate planes by 90, 180, 270. Deprecated. -LIBYUV_API -void RotatePlane90(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane180(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -LIBYUV_API -void RotatePlane270(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -// The 90 and 270 functions are based on transposes. -// Doing a transpose with reversing the read/write -// order will result in a rotation by +- 90 degrees. -// Deprecated. 
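rotate.h below describes the 90/270 rotations as transposes with a reversed read or write order. A small Go sketch (illustrative, not from this change) of the same idea for a single plane, mirroring what RotatePlane90 does by pointing src at the last row and negating the stride before calling TransposePlane:

package main

import "fmt"

// rotate90 rotates a w x h plane 90 degrees clockwise by reading the source
// bottom-up and transposing it into the destination.
func rotate90(src []byte, w, h int) []byte {
	dst := make([]byte, w*h) // result is h wide and w tall
	for y := 0; y < h; y++ {
		for x := 0; x < w; x++ {
			dst[x*h+(h-1-y)] = src[y*w+x]
		}
	}
	return dst
}

func main() {
	// 2x3 plane: rows "abc" and "def" become columns read bottom-up.
	fmt.Println(string(rotate90([]byte("abcdef"), 3, 2))) // prints "daebfc"
}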
-LIBYUV_API -void TransposePlane(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/pkg/encoder/yuv/libyuv/rotate_any.c b/pkg/encoder/yuv/libyuv/rotate_any.c deleted file mode 100644 index 9af8c04ab..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_any.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" - -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ - int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ - } - -#ifdef HAS_TRANSPOSEWX8_SSSE3 - -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) - -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 - -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) - -#endif -#undef TANY - -#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ - int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ - dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ - } - -#ifdef HAS_TRANSPOSEUVWX8_SSE2 - -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) - -#endif -#undef TUVANY diff --git a/pkg/encoder/yuv/libyuv/rotate_common.c b/pkg/encoder/yuv/libyuv/rotate_common.c deleted file mode 100644 index 20c1481a7..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_common.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "rotate_row.h" - -void TransposeWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -void TransposeUVWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -void TransposeWxH_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} diff --git a/pkg/encoder/yuv/libyuv/rotate_gcc.c b/pkg/encoder/yuv/libyuv/rotate_gcc.c deleted file mode 100644 index 54fdafff8..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_gcc.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "rotate_row.h" -#include "row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) - -void TransposeWx8_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) - -// Transpose 16x8. 64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -void TransposeWx8_Fast_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} - -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) - -// Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) - -void TransposeUVWx8_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t) (src_stride)), // %4 - "r"((intptr_t) (dst_stride_a)), // %5 - "r"((intptr_t) (dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} - -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/rotate_row.h b/pkg/encoder/yuv/libyuv/rotate_row.h deleted file mode 100644 index afdae49f0..000000000 --- a/pkg/encoder/yuv/libyuv/rotate_row.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ -#define INCLUDE_LIBYUV_ROTATE_ROW_H_ - -#include "basic_types.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif - -// The following are available for GCC 32 or 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) -#define HAS_TRANSPOSEWX8_SSSE3 -#endif - -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif - -void TransposeWxH_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width, - int height); - -void TransposeWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Fast_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Any_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeWx8_Fast_Any_SSSE3(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride, - int width); - -void TransposeUVWx8_C(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -void TransposeUVWx8_Any_SSE2(const uint8_t *src, - int src_stride, - uint8_t *dst_a, - int dst_stride_a, - uint8_t *dst_b, - int dst_stride_b, - int width); - -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/pkg/encoder/yuv/libyuv/row.h b/pkg/encoder/yuv/libyuv/row.h deleted file mode 100644 index ca1c0c298..000000000 --- a/pkg/encoder/yuv/libyuv/row.h +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_ROW_H_ -#define INCLUDE_LIBYUV_ROW_H_ - -#include // For NULL -#include // For malloc - -#include "basic_types.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif - -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions: -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#endif - -// Effects: -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 - -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_ARGBEXTRACTALPHAROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 -#define HAS_COPYROW_AVX -#define HAS_INTERPOLATEROW_AVX2 -#define HAS_MIRRORROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVROW_AVX2 -#endif - -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_MIRRORUVROW_SSSE3 - -#endif - -// The following are available for AVX2 gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_ABGRTOYROW_AVX2 -#define HAS_MIRRORUVROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_AVX2 -#endif - -#endif - -#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) - #if defined(VISUALC_HAS_AVX2) -#define SIMD_ALIGNED(var) __declspec(align(32)) var -#else -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#endif -#define LIBYUV_NOINLINE __declspec(noinline) -typedef __declspec(align(16)) int16_t vec16[8]; -typedef __declspec(align(16)) int32_t vec32[4]; -typedef __declspec(align(16)) float vecf32[4]; -typedef __declspec(align(16)) int8_t vec8[16]; -typedef __declspec(align(16)) uint16_t uvec16[8]; -typedef __declspec(align(16)) uint32_t uvec32[4]; -typedef __declspec(align(16)) uint8_t uvec8[16]; -typedef __declspec(align(32)) int16_t lvec16[16]; -typedef __declspec(align(32)) int32_t lvec32[8]; -typedef __declspec(align(32)) int8_t lvec8[32]; -typedef __declspec(align(32)) uint16_t ulvec16[16]; -typedef __declspec(align(32)) uint32_t ulvec32[8]; -typedef __declspec(align(32)) uint8_t ulvec8[32]; -#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) -// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
-#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) -#define SIMD_ALIGNED(var) var __attribute__((aligned(32))) -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#endif -#define LIBYUV_NOINLINE __attribute__((noinline)) -typedef int16_t __attribute__((vector_size(16))) vec16; -typedef int32_t __attribute__((vector_size(16))) vec32; -typedef float __attribute__((vector_size(16))) vecf32; -typedef int8_t __attribute__((vector_size(16))) vec8; -typedef uint16_t __attribute__((vector_size(16))) uvec16; -typedef uint32_t __attribute__((vector_size(16))) uvec32; -typedef uint8_t __attribute__((vector_size(16))) uvec8; -typedef int16_t __attribute__((vector_size(32))) lvec16; -typedef int32_t __attribute__((vector_size(32))) lvec32; -typedef int8_t __attribute__((vector_size(32))) lvec8; -typedef uint16_t __attribute__((vector_size(32))) ulvec16; -typedef uint32_t __attribute__((vector_size(32))) ulvec32; -typedef uint8_t __attribute__((vector_size(32))) ulvec8; -#else -#define SIMD_ALIGNED(var) var -#define LIBYUV_NOINLINE -typedef int16_t vec16[8]; -typedef int32_t vec32[4]; -typedef float vecf32[4]; -typedef int8_t vec8[16]; -typedef uint16_t uvec16[8]; -typedef uint32_t uvec32[4]; -typedef uint8_t uvec8[16]; -typedef int16_t lvec16[16]; -typedef int32_t lvec32[8]; -typedef int8_t lvec8[32]; -typedef uint16_t ulvec16[16]; -typedef uint32_t ulvec32[8]; -typedef uint8_t ulvec8[32]; -#endif - -#if !defined(__aarch64__) || !defined(__arm__) -// This struct is for Intel color conversion. -struct YuvConstants { - uint8_t kUVToB[32]; - uint8_t kUVToG[32]; - uint8_t kUVToR[32]; - int16_t kYToRgb[16]; - int16_t kYBiasToRgb[16]; -}; -#endif - -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) - -#define align_buffer_64(var, size) \ - void* var##_mem = malloc((size) + 63); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = NULL - -#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) -#define OMITFP -#else -#define OMITFP __attribute__((optimize("omit-frame-pointer"))) -#endif - -// NaCL macros for GCC x86 and x64. 
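The IS_ALIGNED and align_buffer_64 macros above use the usual over-allocate-and-round-up trick: malloc 63 extra bytes, then round the pointer up to the next 64-byte boundary and free the raw pointer later. A minimal stand-alone sketch of that arithmetic (names and values here are illustrative, not libyuv symbols):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  size_t size = 100;
  void* mem = malloc(size + 63);                 /* over-allocate by 63 bytes */
  if (!mem) return 1;
  uint8_t* buf = (uint8_t*)(((uintptr_t)mem + 63) & ~(uintptr_t)63);
  assert(((uintptr_t)buf & 63) == 0);            /* rounded up to a 64-byte boundary */
  assert((uintptr_t)buf - (uintptr_t)mem < 64);  /* still inside the allocation */
  free(mem);                                     /* free the raw pointer, not buf */
  return 0;
}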
-#if defined(__native_client__) -#define LABELALIGN ".p2align 5\n" -#else -#define LABELALIGN -#endif - -void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width); - -void ARGBToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void ABGRToYRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width); - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width); - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width); - -void ARGBToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); - -void ABGRToYRow_C(const uint8_t *src_rgb, uint8_t *dst_y, int width); - -void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width); - -void ARGBToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void BGRAToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ABGRToYRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBToUVRow_AVX2(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_AVX2(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_SSSE3(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, - int src_stride_bgra, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, - int src_stride_rgba, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_Any_AVX2(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_Any_AVX2(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_Any_SSSE3(const uint8_t *src_ptr, - int src_stride, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ARGBToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void BGRAToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void ABGRToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGBAToUVRow_C(const uint8_t *src_rgb, - int src_stride_rgb, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void RGB565ToUVRow_C(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_u, - uint8_t *dst_v, - int width); - -void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); - -void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width); - -void 
MirrorRow_C(const uint8_t *src, uint8_t *dst, int width); - -void MirrorRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorRow_Any_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width); - -void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width); - -void MirrorUVRow_Any_AVX2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void MirrorUVRow_Any_SSSE3(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_C(const uint8_t *src, uint8_t *dst, int width); - -void ARGBMirrorRow_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void ARGBMirrorRow_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width); - -void CopyRow_C(const uint8_t *src, uint8_t *dst, int count); - -void CopyRow_Any_SSE2(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void CopyRow_Any_AVX(const uint8_t *src_ptr, uint8_t *dst_ptr, int width); - -void RGB565ToARGBRow_SSE2(const uint8_t *src, uint8_t *dst, int width); - -void RGB565ToARGBRow_AVX2(const uint8_t *src_rgb565, - uint8_t *dst_argb, - int width); - -void RGB565ToARGBRow_C(const uint8_t *src_rgb565, uint8_t *dst_argb, int width); - -void RGB565ToARGBRow_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -void RGB565ToARGBRow_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int width); - -// Used for I420Scale, ARGBScale, and ARGBInterpolate. -void InterpolateRow_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction); - -void InterpolateRow_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); - -void InterpolateRow_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); - -void InterpolateRow_Any_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -void InterpolateRow_Any_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); - -#endif // INCLUDE_LIBYUV_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/row_any.c b/pkg/encoder/yuv/libyuv/row_any.c deleted file mode 100644 index fcc49c672..000000000 --- a/pkg/encoder/yuv/libyuv/row_any.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#include // For memset. - -// Subsampled source needs to be increase by 1 of not even. -#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) - -// Any 1 to 1. 
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t vin[128]); \ - SIMD_ALIGNED(uint8_t vout[128]); \ - memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(vin, vout, MASK + 1); \ - memcpy(dst_ptr + n * BPP, vout, r * BPP); \ - } - -#ifdef HAS_COPYROW_AVX - -ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) - -#endif -#ifdef HAS_COPYROW_SSE2 - -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) - -#endif - -#ifdef HAS_ARGBTOYROW_AVX2 - -ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) - -#endif -#ifdef HAS_ABGRTOYROW_AVX2 - -ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) - -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 - -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) - -#endif -#ifdef HAS_BGRATOYROW_SSSE3 - -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) - -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) - -#endif - -#undef ANY11 - -// Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ - void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS vin[64 * 2]); \ - SIMD_ALIGNED(TD vout[64]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ - } \ - memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ - if (source_y_fraction) { \ - memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ - r * SBPP * sizeof(TS)); \ - } \ - ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ - } - -#ifdef HAS_INTERPOLATEROW_AVX2 - -ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) - -#endif -#ifdef HAS_INTERPOLATEROW_SSSE3 - -ANY11I(InterpolateRow_Any_SSSE3, - InterpolateRow_SSSE3, - uint8_t, - uint8_t, - 1, - 1, - 15) - -#endif - -#undef ANY11I - -// Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t vin[64]); \ - SIMD_ALIGNED(uint8_t vout[64]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(vin, src_ptr, r* BPP); \ - ANY_SIMD(vin, vout, MASK + 1); \ - memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ - } - -#ifdef HAS_MIRRORROW_AVX2 - -ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) - -#endif -#ifdef HAS_MIRRORROW_SSSE3 - -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) - -#endif -#ifdef HAS_MIRRORUVROW_AVX2 - -ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) - -#endif -#ifdef HAS_MIRRORUVROW_SSSE3 - -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) - -#endif -#ifdef HAS_ARGBMIRRORROW_AVX2 - -ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) - -#endif -#ifdef HAS_ARGBMIRRORROW_SSE2 - -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) - -#endif -#undef ANY11M - -// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. -// 128 byte row allows for 32 avx ARGB pixels. 
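The ANY11 family of macros above wraps a fixed-width SIMD kernel into an "any width" row function: the kernel runs on the largest multiple of its vector width, and the 1..MASK leftover pixels go through a small zero-padded stack buffer so the kernel never reads or writes past the end of the row. A minimal sketch of the same pattern (CopyRow_Kernel16 and CopyRow_Any_Sketch are hypothetical stand-ins, not libyuv symbols):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for a 16-pixel-wide SIMD kernel. */
static void CopyRow_Kernel16(const uint8_t* src, uint8_t* dst, int width) {
  memcpy(dst, src, (size_t)width);
}

/* Same shape as an ANY11() expansion: full vectors straight to dst,
   then the ragged tail via a padded temporary buffer. */
static void CopyRow_Any_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t vin[16];
  uint8_t vout[16];
  int r = width & 15;   /* ragged tail, 0..15 pixels */
  int n = width & ~15;  /* largest multiple of the kernel width */
  if (n > 0) {
    CopyRow_Kernel16(src, dst, n);
  }
  if (r > 0) {
    memset(vin, 0, sizeof(vin));
    memcpy(vin, src + n, (size_t)r);   /* copy the tail into the padded buffer */
    CopyRow_Kernel16(vin, vout, 16);   /* run one full-width iteration */
    memcpy(dst + n, vout, (size_t)r);  /* write back only the valid pixels */
  }
}

int main(void) {
  uint8_t src[37], dst[37];
  for (int i = 0; i < 37; ++i) src[i] = (uint8_t)i;
  CopyRow_Any_Sketch(src, dst, 37);
  printf("%d %d\n", dst[0], dst[36]);  /* 0 36 */
  return 0;
}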
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ - uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t vin[128 * 2]); \ - SIMD_ALIGNED(uint8_t vout[128 * 2]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ - } \ - memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ - BPP); \ - memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ - vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ - memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ - } - -#ifdef HAS_ARGBTOUVROW_AVX2 - -ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) - -#endif -#ifdef HAS_ABGRTOUVROW_AVX2 - -ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) - -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 - -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) - -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) - -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) - -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) - -#endif -#undef ANY12S diff --git a/pkg/encoder/yuv/libyuv/row_common.c b/pkg/encoder/yuv/libyuv/row_common.c deleted file mode 100644 index 34a93a074..000000000 --- a/pkg/encoder/yuv/libyuv/row_common.c +++ /dev/null @@ -1,887 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -#include -#include // For memcpy and memset. - -#define STATIC_CAST(type, expr) (type)(expr) - -// This macro controls YUV to RGB using unsigned math to extend range of -// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: -// LIBYUV_UNLIMITED_DATA - -// Macros to enable unlimited data for each colorspace -// LIBYUV_UNLIMITED_BT601 -// LIBYUV_UNLIMITED_BT709 -// LIBYUV_UNLIMITED_BT2020 - -#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86)) -#define LIBYUV_ARGBTOUV_PAVGB 1 -#define LIBYUV_RGBTOU_TRUNCATE 1 -#endif -#if defined(LIBYUV_BIT_EXACT) -#define LIBYUV_UNATTENUATE_DUP 1 -#endif - -// llvm x86 is poor at ternary operator, so use branchless min/max. - -#define USE_BRANCHLESS 1 -#if USE_BRANCHLESS - -static __inline int32_t clamp0(int32_t v) { - return -(v >= 0) & v; -} - -// TODO(fbarchard): make clamp255 preserve negative values. 
-static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -static __inline int32_t clamp1023(int32_t v) { - return (-(v >= 1023) | v) & 1023; -} - -// clamp to max -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (-(v >= max) | v) & max; -} - -static __inline uint32_t Abs(int32_t v) { - int m = -(v < 0); - return (v + m) ^ m; -} - -#else // USE_BRANCHLESS -static __inline int32_t clamp0(int32_t v) { - return (v < 0) ? 0 : v; -} - -static __inline int32_t clamp255(int32_t v) { - return (v > 255) ? 255 : v; -} - -static __inline int32_t clamp1023(int32_t v) { - return (v > 1023) ? 1023 : v; -} - -static __inline int32_t ClampMax(int32_t v, int32_t max) { - return (v > max) ? max : v; -} - -static __inline uint32_t Abs(int32_t v) { - return (v < 0) ? -v : v; -} -#endif // USE_BRANCHLESS - -static __inline uint32_t Clamp(int32_t val) { - int v = clamp0(val); - return (uint32_t) (clamp255(v)); -} - -static __inline uint32_t Clamp10(int32_t val) { - int v = clamp0(val); - return (uint32_t) (clamp1023(v)); -} - -// Little Endian -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ - defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define WRITEWORD(p, v) *(uint32_t*)(p) = v -#else -static inline void WRITEWORD(uint8_t* p, uint32_t v) { - p[0] = (uint8_t)(v & 255); - p[1] = (uint8_t)((v >> 8) & 255); - p[2] = (uint8_t)((v >> 16) & 255); - p[3] = (uint8_t)((v >> 24) & 255); -} -#endif - -void RGB565ToARGBRow_C(const uint8_t *src_rgb565, - uint8_t *dst_argb, - int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); - dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); - dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); - dst_argb[3] = 255u; - dst_argb += 4; - src_rgb565 += 2; - } -} - -// 8 bit -// Intel SSE/AVX uses the following equivalent formula -// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. -// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + -// 0x7e80) >> 8; - -static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); -} - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - -// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. -#ifdef LIBYUV_RGBTOU_TRUNCATE - -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); -} - -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); -} - -#else -// TODO(fbarchard): Add rounding to x86 SIMD and use this -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); -} -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); -} -#endif - -// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. 
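RGBToY above uses the BT.601 studio-range weights (66, 129, 25 in 8-bit fixed point) with the +16 offset and rounding folded into the 0x1080 bias, so its output lands in the 16..235 range. A quick numeric check of that behaviour (a throwaway sketch, not code from the tree):

#include <assert.h>
#include <stdint.h>

static uint8_t RGBToY_ref(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

int main(void) {
  assert(RGBToY_ref(0, 0, 0) == 16);         /* black -> Y = 16  */
  assert(RGBToY_ref(255, 255, 255) == 235);  /* white -> Y = 235 */
  return 0;
}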
-#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); -} -static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST( - uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); -} -#endif - -// ARGBToY_C and ARGBToUV_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = src_rgb[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ - } -#endif - -MAKEROWY(ARGB, 2, 1, 0, 4) - -MAKEROWY(BGRA, 1, 2, 3, 4) - -MAKEROWY(ABGR, 0, 1, 2, 4) - -MAKEROWY(RGBA, 3, 2, 1, 4) - -#undef MAKEROWY - -// JPeg uses a variation on BT.601-1 full range -// y = 0.29900 * r + 0.58700 * g + 0.11400 * b -// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center -// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center -// BT.601 Mpeg range uses: -// b 0.1016 * 255 = 25.908 = 25 -// g 0.5078 * 255 = 129.489 = 129 -// r 0.2578 * 255 = 65.739 = 66 -// JPeg 7 bit Y (deprecated) -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// 
r 0.29900 * 128 = 38.272 = 38 -// JPeg 8 bit Y: -// b 0.11400 * 256 = 29.184 = 29 -// g 0.58700 * 256 = 150.272 = 150 -// r 0.29900 * 256 = 76.544 = 77 -// JPeg 8 bit U: -// b 0.50000 * 255 = 127.5 = 127 -// g -0.33126 * 255 = -84.4713 = -84 -// r -0.16874 * 255 = -43.0287 = -43 -// JPeg 8 bit V: -// b -0.08131 * 255 = -20.73405 = -20 -// g -0.41869 * 255 = -106.76595 = -107 -// r 0.50000 * 255 = 127.5 = 127 - -// 8 bit -static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (77 * r + 150 * g + 29 * b + 128) >> 8; -} - -#if defined(LIBYUV_ARGBTOUV_PAVGB) - -static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; -} - -static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; -} - -#else -static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; -} -static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { - return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; -} -#endif - -// ARGBToYJ_C and ARGBToUVJ_C -// Intel version mimic SSE/AVX which does 2 pavgb -#if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ - AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ - AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ - AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ - } -#else -// ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ - src_rgb += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = 
(src_rgb[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ - } - -#endif - -MAKEROWYJ(ARGB, 2, 1, 0, 4) - -MAKEROWYJ(ABGR, 0, 1, 2, 4) - -MAKEROWYJ(RGBA, 3, 2, 1, 4) - -MAKEROWYJ(RGB24, 2, 1, 0, 3) - -MAKEROWYJ(RAW, 0, 1, 2, 3) - -#undef MAKEROWYJ - -void RGB565ToYRow_C(const uint8_t *src_rgb565, uint8_t *dst_y, int width) { - int x; - for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r = src_rgb565[1] >> 3; - b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); - g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); - r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); - dst_y[0] = RGBToY(r, g, b); - src_rgb565 += 2; - dst_y += 1; - } -} - -void RGB565ToUVRow_C(const uint8_t *src_rgb565, - int src_stride_rgb565, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - const uint8_t *next_rgb565 = src_rgb565 + src_stride_rgb565; - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g0 = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); - uint8_t g1 = STATIC_CAST( - uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); - uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); - uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); - uint8_t g2 = STATIC_CAST( - uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); - uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); - uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); - uint8_t g3 = STATIC_CAST( - uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); - uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); - - b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); - g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); - r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); - b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); - g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); - r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); - b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); - g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); - r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); - b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); - g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); - r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; - uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; - uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - - src_rgb565 += 4; - next_rgb565 += 4; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); - uint8_t g0 = STATIC_CAST( - uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); - uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); - uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); - uint8_t g2 = STATIC_CAST( - uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); - uint8_t r2 = STATIC_CAST(uint8_t, 
next_rgb565[1] >> 3); - b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); - g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); - r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); - b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); - g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); - r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); - -#if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(b0, b2); - uint8_t ag = AVGB(g0, g2); - uint8_t ar = AVGB(r0, r2); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = b0 + b2; - uint16_t g = g0 + g2; - uint16_t r = r0 + r2; - dst_u[0] = RGB2xToU(r, g, b); - dst_v[0] = RGB2xToV(r, g, b); -#endif - } -} - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 24 - -#undef REPEAT8 -#undef SHADE - -#define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v* f >> 16 - -#undef REPEAT8 -#undef SHADE - -#define SHADE(f, v) clamp255(v + f) - -#undef SHADE - -#define SHADE(f, v) clamp0(f - v) - -#undef SHADE - -// Macros to create SIMD specific yuv to rgb conversion constants. - -// clang-format off - -#if defined(__aarch64__) || defined(__arm__) -// Bias values include subtract 128 from U and V, bias from Y and rounding. -// For B and R bias is negative. For G bias is positive. -#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ - 0, 0}} -#else -#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ - {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} -#endif - -// clang-format on - -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ - YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ - YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); - -// TODO(fbarchard): Generate SIMD structures from float matrix. - -// BT.601 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.596 -// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 -// B = (Y - 16) * 1.164 + U * 2.018 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601) -#define UB 129 /* round(2.018 * 64) */ -#else -#define UB 128 /* max(128, round(2.018 * 64)) */ -#endif -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ -#define VR 102 /* round(1.596 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.601 full range YUV to RGB reference (aka JPEG) -// * R = Y + V * 1.40200 -// * G = Y - U * 0.34414 - V * 0.71414 -// * B = Y + U * 1.77200 -// KR = 0.299; KB = 0.114 - -// U and V contributions to R,G,B. 
-#define UB 113 /* round(1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR 90 /* round(1.40200 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 limited range YUV to RGB reference -// R = (Y - 16) * 1.164 + V * 1.793 -// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 -// B = (Y - 16) * 1.164 + U * 2.112 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709) -#define UB 135 /* round(2.112 * 64) */ -#else -#define UB 128 /* max(128, round(2.112 * 64)) */ -#endif -#define UG 14 /* round(0.213 * 64) */ -#define VG 34 /* round(0.533 * 64) */ -#define VR 115 /* round(1.793 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.709 full range YUV to RGB reference -// R = Y + V * 1.5748 -// G = Y - U * 0.18732 - V * 0.46812 -// B = Y + U * 1.8556 -// KR = 0.2126, KB = 0.0722 - -// U and V contributions to R,G,B. -#define UB 119 /* round(1.8556 * 64) */ -#define UG 12 /* round(0.18732 * 64) */ -#define VG 30 /* round(0.46812 * 64) */ -#define VR 101 /* round(1.5748 * 64) */ - -// Y contribution to R,G,B. Scale and bias. (same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 limited range YUV to RGB reference -// R = (Y - 16) * 1.164384 + V * 1.67867 -// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042 -// B = (Y - 16) * 1.164384 + U * 2.14177 -// KR = 0.2627; KB = 0.0593 - -// U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020) -#define UB 137 /* round(2.142 * 64) */ -#else -#define UB 128 /* max(128, round(2.142 * 64)) */ -#endif -#define UG 12 /* round(0.187326 * 64) */ -#define VG 42 /* round(0.65042 * 64) */ -#define VR 107 /* round(1.67867 * 64) */ - -// Y contribution to R,G,B. Scale and bias. -#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ -#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ - -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -// BT.2020 full range YUV to RGB reference -// R = Y + V * 1.474600 -// G = Y - U * 0.164553 - V * 0.571353 -// B = Y + U * 1.881400 -// KR = 0.2627; KB = 0.0593 - -#define UB 120 /* round(1.881400 * 64) */ -#define UG 11 /* round(0.164553 * 64) */ -#define VG 37 /* round(0.571353 * 64) */ -#define VR 94 /* round(1.474600 * 64) */ - -// Y contribution to R,G,B. Scale and bias. 
(same as jpeg) -#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ -#define YB 32 /* 64 / 2 */ - -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) - -#undef YG -#undef YB -#undef UB -#undef UG -#undef VG -#undef VR - -#undef BB -#undef BG -#undef BR - -#undef MAKEYUVCONSTANTS - -#if defined(__aarch64__) || defined(__arm__) -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVCoeff[0]; \ - int vr = yuvconstants->kUVCoeff[1]; \ - int ug = yuvconstants->kUVCoeff[2]; \ - int vg = yuvconstants->kUVCoeff[3]; \ - int yg = yuvconstants->kRGBCoeffBias[0]; \ - int bb = yuvconstants->kRGBCoeffBias[1]; \ - int bg = yuvconstants->kRGBCoeffBias[2]; \ - int br = yuvconstants->kRGBCoeffBias[3] - -#define CALC_RGB16 \ - int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ - int b16 = y1 + (u * ub) - bb; \ - int g16 = y1 + bg - (u * ug + v * vg); \ - int r16 = y1 + (v * vr) - br -#else -#define LOAD_YUV_CONSTANTS \ - int ub = yuvconstants->kUVToB[0]; \ - int ug = yuvconstants->kUVToG[0]; \ - int vg = yuvconstants->kUVToG[1]; \ - int vr = yuvconstants->kUVToR[1]; \ - int yg = yuvconstants->kYToRgb[0]; \ - int yb = yuvconstants->kYBiasToRgb[0] - -#define CALC_RGB16 \ - int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ - int8_t ui = (int8_t)u; \ - int8_t vi = (int8_t)v; \ - ui -= 0x80; \ - vi -= 0x80; \ - int b16 = y1 + (ui * ub); \ - int g16 = y1 - (ui * ug + vi * vg); \ - int r16 = y1 + (vi * vr) -#endif - -void MirrorRow_C(const uint8_t *src, uint8_t *dst, int width) { - int x; - src += width - 1; - for (x = 0; x < width - 1; x += 2) { - dst[x] = src[0]; - dst[x + 1] = src[-1]; - src -= 2; - } - if (width & 1) { - dst[width - 1] = src[0]; - } -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -// TODO(fbarchard): change scale to bits -#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) - -void CopyRow_C(const uint8_t *src, uint8_t *dst, int count) { - memcpy(dst, src, count); -} - -// Divide source RGB by alpha and store to destination. -// b = (b * 255 + (a / 2)) / a; -// g = (g * 255 + (a / 2)) / a; -// r = (r * 255 + (a / 2)) / a; -// Reciprocal method is off by 1 on some values. ie 125 -// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
-#define T(a) 0x01000000 + (0x10000 / a) -const uint32_t fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), - T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), - T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), - T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), - T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), - T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), - T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), - T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), - T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), - T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), - T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), - T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), - T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), - T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), - T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), - T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), - T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), - T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), - T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), - T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), - T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), - T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), - T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), - T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), - T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), - T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), - T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), - T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), - T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), - T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), - T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), - T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), - T(0xfc), T(0xfd), T(0xfe), 0x01000100}; -#undef T - -// Blend 2 rows into 1. -static void HalfRow_C(const uint8_t *src_uv, - ptrdiff_t src_uv_stride, - uint8_t *dst_uv, - int width) { - int x; - for (x = 0; x < width; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8_t *src_ptr1 = src_ptr + src_stride; - int x; - assert(source_y_fraction >= 0); - assert(source_y_fraction < 256); - - if (y1_fraction == 0) { - memcpy(dst_ptr, src_ptr, width); - return; - } - if (y1_fraction == 128) { - HalfRow_C(src_ptr, src_stride, dst_ptr, width); - return; - } - for (x = 0; x < width; ++x) { - dst_ptr[0] = STATIC_CAST( - uint8_t, - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); - ++src_ptr; - ++src_ptr1; - ++dst_ptr; - } -} - -// Work around GCC 7 punning warning -Wstrict-aliasing -#if defined(__GNUC__) -typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; -#else -typedef uint32_t uint32_alias_t; -#endif - -#undef STATIC_CAST diff --git a/pkg/encoder/yuv/libyuv/row_gcc.c b/pkg/encoder/yuv/libyuv/row_gcc.c deleted file mode 100644 index 07e795e60..000000000 --- a/pkg/encoder/yuv/libyuv/row_gcc.c +++ /dev/null @@ -1,1090 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -// Constants for ARGB -static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, - 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; - - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) - -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0}; - -// Constants for BGRA -static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, - 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; - -static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, - 0, -38, -74, 112, 0, -38, -74, 112}; - -static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, - 0, 112, -94, -18, 0, 112, -94, -18}; - -// Constants for ABGR -static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, - 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; - -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - -// Constants for RGBA. 
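InterpolateRow_C above blends two rows with an 8-bit vertical fraction, dst = (src0 * (256 - f) + src1 * f + 128) >> 8, with f = 0 short-circuiting to a copy and f = 128 to HalfRow_C. A per-pixel sketch of the same arithmetic (illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t blend_pixel(uint8_t src0, uint8_t src1, int f) {
  return (uint8_t)((src0 * (256 - f) + src1 * f + 128) >> 8);
}

int main(void) {
  assert(blend_pixel(100, 200, 0) == 100);    /* f = 0:   copy the top row   */
  assert(blend_pixel(100, 200, 128) == 150);  /* f = 128: average of the two */
  assert(blend_pixel(100, 200, 64) == 125);   /* f = 64:  75% / 25% blend    */
  return 0;
}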
-//static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, -// 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; - -static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, - 0, 112, -74, -38, 0, 112, -74, -38}; - -static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, - 0, -18, -94, 112, 0, -18, -94, 112}; - -static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, - 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; - -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; - -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -// clang-format off - -// TODO(mraptis): Consider passing R, G, B multipliers as parameter. -// round parameter is register containing value to add before shift. -#define RGBTOY(round) \ - "1: \n" \ - "movdqu (%0),%%xmm0 \n" \ - "movdqu 0x10(%0),%%xmm1 \n" \ - "movdqu 0x20(%0),%%xmm2 \n" \ - "movdqu 0x30(%0),%%xmm3 \n" \ - "psubb %%xmm5,%%xmm0 \n" \ - "psubb %%xmm5,%%xmm1 \n" \ - "psubb %%xmm5,%%xmm2 \n" \ - "psubb %%xmm5,%%xmm3 \n" \ - "movdqu %%xmm4,%%xmm6 \n" \ - "pmaddubsw %%xmm0,%%xmm6 \n" \ - "movdqu %%xmm4,%%xmm0 \n" \ - "pmaddubsw %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm4,%%xmm1 \n" \ - "pmaddubsw %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm4,%%xmm2 \n" \ - "pmaddubsw %%xmm3,%%xmm2 \n" \ - "lea 0x40(%0),%0 \n" \ - "phaddw %%xmm0,%%xmm6 \n" \ - "phaddw %%xmm2,%%xmm1 \n" \ - "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ - "psrlw $0x8,%%xmm6 \n" \ - "psrlw $0x8,%%xmm1 \n" \ - "packuswb %%xmm1,%%xmm6 \n" \ - "movdqu %%xmm6,(%1) \n" \ - "lea 0x10(%1),%1 \n" \ - "sub $0x10,%2 \n" \ - "jg 1b \n" - -#define RGBTOY_AVX2(round) \ - "1: \n" \ - "vmovdqu (%0),%%ymm0 \n" \ - "vmovdqu 0x20(%0),%%ymm1 \n" \ - "vmovdqu 0x40(%0),%%ymm2 \n" \ - "vmovdqu 0x60(%0),%%ymm3 \n" \ - "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ - "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ - "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ - "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ - "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ - "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ - "lea 0x80(%0),%0 \n" \ - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ - "vmovdqu %%ymm0,(%1) \n" \ - "lea 0x20(%1),%1 \n" \ - "sub $0x20,%2 \n" \ - "jg 1b \n" \ - "vzeroupper \n" - -// clang-format on - -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
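/* Illustrative scalar equivalents (not libyuv API) of what the RGBTOY /
 * RGBTOY_AVX2 macros and the UV row functions below compute, derived from
 * the coefficient vectors above. kSub128 and kAddY16 (0x7e80) fold the
 * signed pmaddubsw trick, the +16 luma offset and rounding into a single
 * add, so the per-pixel result is the usual BT.601 studio-range conversion: */
#include <stdint.h>

static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);  /* 0x1080 = 16*256 + 128 */
}

/* U and V are computed from 2x2-averaged r/g/b (the pavgb/vpavgb steps). */
static uint8_t rgb_to_u(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static uint8_t rgb_to_v(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}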
-void ARGBToYRow_SSSE3(const uint8_t *src_argb, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOYROW_SSSE3 - -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ - defined(HAS_ARGBEXTRACTALPHAROW_AVX2) -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; -#endif - -#ifdef HAS_ARGBTOYROW_AVX2 - -// Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8_t *src_argb, uint8_t *dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOYROW_AVX2 - -#ifdef HAS_ABGRTOYROW_AVX2 - -// Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYRow_AVX2(const uint8_t *src_abgr, uint8_t *dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ABGRTOYROW_AVX2 - -#ifdef HAS_ARGBTOUVROW_SSSE3 - -void ARGBToUVRow_SSSE3(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : 
"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#endif // HAS_ARGBTOUVROW_SSSE3 - -#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ - defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) -// vpshufb for vphaddw + vpackuswb packed to shorts. -static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -#endif - -#if defined(HAS_ARGBTOUVROW_AVX2) - -void ARGBToUVRow_AVX2(const uint8_t *src_argb, - int src_stride_argb, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ABGRTOUVROW_AVX2 - -void ABGRToUVRow_AVX2(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb 
%%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_abgr)), // %4 - "m"(kAddUV128), // %5 - "m"(kABGRToV), // %6 - "m"(kABGRToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif // HAS_ABGRTOUVROW_AVX2 - -void BGRAToYRow_SSSE3(const uint8_t *src_bgra, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void BGRAToUVRow_SSSE3(const uint8_t *src_bgra, - int src_stride_bgra, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void ABGRToYRow_SSSE3(const uint8_t *src_abgr, uint8_t *dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN RGBTOY(xmm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -void ABGRToUVRow_SSSE3(const uint8_t *src_abgr, - int src_stride_abgr, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" 
- "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void RGBAToUVRow_SSSE3(const uint8_t *src_rgba, - int src_stride_rgba, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t) (src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -#ifdef HAS_MIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. 
-static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void MirrorRow_SSSE3(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORROW_SSSE3 - -#ifdef HAS_MIRRORROW_AVX2 - -void MirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORROW_AVX2 - -#ifdef HAS_MIRRORUVROW_SSSE3 -// Shuffle table for reversing the UV. -static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, - 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; - -void MirrorUVRow_SSSE3(const uint8_t *src_uv, uint8_t *dst_uv, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "movdqa %3,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORUVROW_SSSE3 - -#ifdef HAS_MIRRORUVROW_AVX2 - -void MirrorUVRow_AVX2(const uint8_t *src_uv, uint8_t *dst_uv, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vbroadcastf128 %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_MIRRORUVROW_AVX2 - -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. 
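/* Illustrative scalar forms (not libyuv API) of the mirroring that the
 * pshufb shuffle tables above implement: MirrorRow reverses single bytes,
 * MirrorUVRow reverses interleaved UV pairs while keeping each pair's byte
 * order. */
#include <stdint.h>

static void mirror_row_sketch(const uint8_t *src, uint8_t *dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}

static void mirror_uv_row_sketch(const uint8_t *src_uv, uint8_t *dst_uv, int width) {
  for (int x = 0; x < width; ++x) {        /* width counts UV pairs */
    dst_uv[2 * x + 0] = src_uv[2 * (width - 1 - x) + 0];
    dst_uv[2 * x + 1] = src_uv[2 * (width - 1 - x) + 1];
  }
}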
-static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -void MirrorSplitUVRow_SSSE3(const uint8_t *src, - uint8_t *dst_u, - uint8_t *dst_v, - int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_MIRRORSPLITUVROW_SSSE3 - -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} - -#endif // HAS_ARGBMIRRORROW_SSE2 - -#ifdef HAS_ARGBMIRRORROW_AVX2 -// Shuffle table for reversing the bytes. -static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - -void ARGBMirrorRow_AVX2(const uint8_t *src, uint8_t *dst, int width) { - intptr_t temp_width = (intptr_t) (width); - asm volatile( - - "vmovdqu %3,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} - -#endif // HAS_ARGBMIRRORROW_AVX2 - - -#ifdef HAS_COPYROW_SSE2 - -void CopyRow_SSE2(const uint8_t *src, uint8_t *dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_COPYROW_SSE2 - -#ifdef HAS_COPYROW_AVX - -void CopyRow_AVX(const uint8_t *src, uint8_t *dst, int width) { - asm volatile( - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -#endif // HAS_COPYROW_AVX - -#ifdef HAS_COPYROW_ERMS - -// Multiple of 1. 
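/* The copy kernels above differ only in how the bytes move: CopyRow_SSE2
 * tests 16-byte alignment of both pointers to choose movdqa over movdqu,
 * CopyRow_AVX moves 64 bytes per iteration, and CopyRow_ERMS below defers
 * to "rep movsb". The portable equivalent is a plain memcpy; sketch only,
 * not libyuv API. */
#include <stdint.h>
#include <string.h>

static void copy_row_sketch(const uint8_t *src, uint8_t *dst, int width) {
  memcpy(dst, src, (size_t)width);
}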
-void CopyRow_ERMS(const uint8_t *src, uint8_t *dst, int width) { - size_t width_tmp = (size_t) (width); - asm volatile( - - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); -} - -#endif // HAS_COPYROW_ERMS - -#ifdef HAS_INTERPOLATEROW_SSSE3 - -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_AVX2 - -// Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 100b \n" - - "99: \n" - "vzeroupper \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); -} - -#endif // HAS_INTERPOLATEROW_AVX2 - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale.c b/pkg/encoder/yuv/libyuv/scale.c deleted file mode 100644 index c4bd5b0b4..000000000 --- a/pkg/encoder/yuv/libyuv/scale.c +++ /dev/null @@ -1,946 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale.h" - -#include -#include - -#include "cpu_id.h" -#include "planar_functions.h" // For CopyPlane -#include "row.h" -#include "scale_row.h" - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Scale plane, 1/2 -// This is an optimized version for scaling down a plane to 1/2 of -// its original size. - -static void ScalePlaneDown2(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown2)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width) = - filtering == kFilterNone - ? ScaleRowDown2_C - : (filtering == kFilterLinear ? ScaleRowDown2Linear_C - : ScaleRowDown2Box_C); - int row_stride = src_stride * 2; - (void) src_width; - (void) src_height; - if (!filtering) { - src_ptr += src_stride; // Point to odd rows. - src_stride = 0; - } - - -#if defined(HAS_SCALEROWDOWN2_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 - : ScaleRowDown2Box_Any_SSSE3); - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_SSSE3 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 - : ScaleRowDown2Box_SSSE3); - } - } -#endif -#if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = - filtering == kFilterNone - ? ScaleRowDown2_Any_AVX2 - : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 - : ScaleRowDown2Box_Any_AVX2); - if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 - : (filtering == kFilterLinear - ? ScaleRowDown2Linear_AVX2 - : ScaleRowDown2Box_AVX2); - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - // TODO(fbarchard): Loop through source height to allow odd height. 
- for (y = 0; y < dst_height; ++y) { - ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane, 1/4 -// This is an optimized version for scaling down a plane to 1/4 of -// its original size. - -static void ScalePlaneDown4(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown4)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; - int row_stride = src_stride * 4; - (void) src_width; - (void) src_height; - if (!filtering) { - src_ptr += src_stride * 2; // Point to row 2. - src_stride = 0; - } - -#if defined(HAS_SCALEROWDOWN4_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; - if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; - } - } -#endif -#if defined(HAS_SCALEROWDOWN4_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = - filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; - if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; - } - } -#endif - - if (filtering == kFilterLinear) { - src_stride = 0; - } - for (y = 0; y < dst_height; ++y) { - ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += row_stride; - dst_ptr += dst_stride; - } -} - -// Scale plane down, 3/4 -static void ScalePlaneDown34(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown34_0)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - (void) src_width; - (void) src_height; - assert(dst_width % 3 == 0); - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_C; - ScaleRowDown34_1 = ScaleRowDown34_C; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; - } - -#if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; - } - if (dst_width % 24 == 0) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; - } - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride; - dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); - } -} - -// Scale plane, 3/8 -// This is an optimized version for scaling down a plane to 3/8 -// of its original size. -// -// Uses box filter arranges like this -// aaabbbcc -> abc -// aaabbbcc def -// aaabbbcc ghi -// dddeeeff -// dddeeeff -// dddeeeff -// ggghhhii -// ggghhhii -// Boxes are 3x3, 2x3, 3x2 and 2x2 - -static void ScalePlaneDown38(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int y; - void (*ScaleRowDown38_3)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, int dst_width); - const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; - assert(dst_width % 3 == 0); - (void) src_width; - (void) src_height; - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_C; - ScaleRowDown38_2 = ScaleRowDown38_C; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; - } - -#if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; - } - if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } - if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; - } - } -#endif - - for (y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 2; - dst_ptr += dst_stride; - } - - // Remainder 1 or 2 rows with last row vertically unfiltered - if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); - src_ptr += src_stride * 3; - dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } else if ((dst_height % 3) == 1) { - ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); - } -} - -#define MIN1(x) ((x) < 1 ? 1 : (x)) - -static __inline uint32_t SumPixels(int iboxwidth, const uint16_t *src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t *src_ptr) { - uint32_t sum = 0u; - int x; - assert(iboxwidth > 0); - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - return sum; -} - -static void ScaleAddCols2_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int i; - int scaletbl[2]; - int minboxwidth = dx >> 16; - int boxwidth; - scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); - scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = MIN1((x >> 16) - ix); - int scaletbl_index = boxwidth - minboxwidth; - assert((scaletbl_index == 0) || (scaletbl_index == 1)); - *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + ix) * - scaletbl[scaletbl_index] >> - 16); - } -} - -static void ScaleAddCols0_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int scaleval = 65536 / boxheight; - int i; - (void) dx; - src_ptr += (x >> 16); - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t) (src_ptr[i] * scaleval >> 16); - } -} - -static void ScaleAddCols1_C(int dst_width, - int boxheight, - int x, - int dx, - const uint16_t *src_ptr, - uint8_t *dst_ptr) { - int boxwidth = MIN1(dx >> 16); - int scaleval = 65536 / (boxwidth * boxheight); - int i; - x >>= 16; - for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = (uint8_t) (SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); - x += boxwidth; - } -} - -// Scale plane down to any dimensions, with interpolation. -// (boxfilter). 
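/* Illustrative sketch (not libyuv API) of the column pass used by the box
 * scaler: positions are 16.16 fixed point, dx is the source step per output
 * pixel, and 65536 / (boxwidth * boxheight) turns a box sum into an average
 * with one multiply and shift. This is the exact-ratio case handled by
 * ScaleAddCols1_C above; ScaleAddCols2_C covers fractional steps with a
 * two-entry scale table. row_sums holds boxheight rows already added
 * together, as in ScalePlaneBox below. */
#include <stdint.h>

static void scale_add_cols_sketch(int dst_width, int boxheight, int x, int dx,
                                  const uint16_t *row_sums, uint8_t *dst) {
  int boxwidth = dx >> 16;                      /* whole source pixels per box, >= 1 */
  int scaleval = 65536 / (boxwidth * boxheight);
  int ix = x >> 16;                             /* integer start column */
  for (int i = 0; i < dst_width; ++i) {
    uint32_t sum = 0;
    for (int k = 0; k < boxwidth; ++k) {
      sum += row_sums[ix + k];
    }
    dst[i] = (uint8_t)((sum * scaleval) >> 16);
    ix += boxwidth;
  }
}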
-// -// Same method as SimpleScale, which is fixed point, outputting -// one pixel of destination using fixed point (16.16) to step -// through source, sampling a box of pixel with simple -// averaging. -static void ScalePlaneBox(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - int j, k; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - { - // Allocate a row buffer of uint16_t. - align_buffer_64(row16, src_width * 2); - void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16_t *src_ptr, uint8_t *dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C - : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8_t *src_ptr, uint16_t *dst_ptr, - int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; - } - } -#endif -#if defined(HAS_SCALEADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleAddRow = ScaleAddRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - ScaleAddRow = ScaleAddRow_AVX2; - } - } -#endif - - - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint8_t *src = src_ptr + iy * (int64_t) src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = MIN1((y >> 16) - iy); - memset(row16, 0, src_width * 2); - for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16_t *) (row16), src_width); - src += src_stride; - } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t *) (row16), dst_ptr); - dst_ptr += dst_stride; - } - free_aligned_buffer_64(row16); - } -} - -// Scale plane down with bilinear interpolation. -static void ScalePlaneBilinearDown(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. - // Allocate a row buffer. - align_buffer_64(row, src_width); - - const int max_y = (src_height - 1) << 16; - int j; - void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, - int dst_width, int x, int dx) = - (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(src_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif - - if (y > max_y) { - y = max_y; - } - - for (j = 0; j < dst_height; ++j) { - int yi = y >> 16; - const uint8_t *src = src_ptr + yi * (int64_t) src_stride; - if (filtering == kFilterLinear) { - ScaleFilterCols(dst_ptr, src, dst_width, x, dx); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, row, dst_width, x, dx); - } - dst_ptr += dst_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - } - free_aligned_buffer_64(row); -} - -// Scale up down with bilinear interpolation. -static void ScalePlaneBilinearUp(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr, - enum FilterMode filtering) { - int j; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t *dst_ptr, const uint8_t *src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, - int dst_width, int x, int dx) = - filtering ? ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - - if (filtering && src_width >= 32768) { - ScaleFilterCols = ScaleFilterCols64_C; - } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; - } -#endif - - if (!filtering && src_width * 2 == dst_width && x < 0x8000) { - ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; - } -#endif - } - - if (y > max_y) { - y = max_y; - } - { - int yi = y >> 16; - const uint8_t *src = src_ptr + yi * (int64_t) src_stride; - - // Allocate 2 row buffers. 
- const int row_size = (dst_width + 31) & ~31; - align_buffer_64(row, row_size * 2); - - uint8_t *rowptr = row; - int rowstride = row_size; - int lasty = yi; - - ScaleFilterCols(rowptr, src, dst_width, x, dx); - if (src_height > 1) { - src += src_stride; - } - ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); - if (src_height > 2) { - src += src_stride; - } - - for (j = 0; j < dst_height; ++j) { - yi = y >> 16; - if (yi != lasty) { - if (y > max_y) { - y = max_y; - yi = y >> 16; - src = src_ptr + yi * (int64_t) src_stride; - } - if (yi != lasty) { - ScaleFilterCols(rowptr, src, dst_width, x, dx); - rowptr += rowstride; - rowstride = -rowstride; - lasty = yi; - if ((y + 65536) < max_y) { - src += src_stride; - } - } - } - if (filtering == kFilterLinear) { - InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); - } else { - int yf = (y >> 8) & 255; - InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); - } - dst_ptr += dst_stride; - y += dy; - } - free_aligned_buffer_64(row); - } -} - -// Scale plane, horizontally up by 2 times. -// Uses linear filter horizontally, nearest vertically. -// This is an optimized version for scaling up a plane to 2 times of -// its original width, using linear interpolation. -// This is used to scale U and V planes of I422 to I444. -static void ScalePlaneUp2_Linear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - void (*ScaleRowUp)(const uint8_t *src_ptr, uint8_t *dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; - int i; - int y; - int dy; - - (void) src_width; - // This function can only scale up by 2 times horizontally. - assert(src_width == ((dst_width + 1) / 2)); - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; - } -#endif - - - if (dst_height == 1) { - ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t) src_stride, dst_ptr, - dst_width); - } else { - dy = FixedDiv(src_height - 1, dst_height - 1); - y = (1 << 15) - 1; - for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_ptr + (y >> 16) * (int64_t) src_stride, dst_ptr, dst_width); - dst_ptr += dst_stride; - y += dy; - } - } -} - -// Scale plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// This is used to scale U and V planes of I420 to I444. -static void ScalePlaneUp2_Bilinear(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - void (*Scale2RowUp)(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; - int x; - - (void) src_width; - // This function can only scale up by 2 times. 
- assert(src_width == ((dst_width + 1) / 2)); - assert(src_height == ((dst_height + 1) / 2)); - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; - } -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; - } -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; - } -#endif - - - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO(fbarchard): Test performance of writing one row of destination at a - // time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } -} - -// Scale Plane to/from any dimensions, without interpolation. -// Fixed point math is used for performance: The upper 16 bits -// of x and dx is the integer part of the source position and -// the lower 16 bits are the fixed decimal part. - -static void ScalePlaneSimple(int src_width, - int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_ptr, - uint8_t *dst_ptr) { - int i; - void (*ScaleCols)(uint8_t *dst_ptr, const uint8_t *src_ptr, int dst_width, - int x, int dx) = ScaleCols_C; - // Initial source x/y coordinate and step values as 16.16 fixed point. - int x = 0; - int y = 0; - int dx = 0; - int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, - &dx, &dy); - src_width = Abs(src_width); - - if (src_width * 2 == dst_width && x < 0x8000) { - ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; - } -#endif - } - - for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t) src_stride, dst_width, x, - dx); - dst_ptr += dst_stride; - y += dy; - } -} - -// Scale a plane. -// This function dispatches to a specialized scaler based on scale factor. -LIBYUV_API -void ScalePlane(const uint8_t *src, - int src_stride, - int src_width, - int src_height, - uint8_t *dst, - int dst_stride, - int dst_width, - int dst_height, - enum FilterMode filtering) { - // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, - filtering); - - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - src = src + (src_height - 1) * (int64_t) src_stride; - src_stride = -src_stride; - } - // Use specialized scales to improve performance for common resolutions. - // For example, all the 1/2 scalings will use ScalePlaneDown2() - if (dst_width == src_width && dst_height == src_height) { - // Straight copy. - CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - return; - } - if (dst_width == src_width && filtering != kFilterBox) { - int dy = 0; - int y = 0; - // When scaling down, use the center 2 rows to filter. - // When scaling up, last row of destination uses the last 2 source rows. - if (dst_height <= src_height) { - dy = FixedDiv(src_height, dst_height); - y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_height > 1 && dst_height > 1) { - dy = FixedDiv1(src_height, dst_height); - } - // Arbitrary scale vertically, but unscaled horizontally. 
- ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); - return; - } - if (dst_width <= Abs(src_width) && dst_height <= src_height) { - // Scale down. - if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { - // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (2 * dst_width == src_width && 2 * dst_height == src_height) { - // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { - // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - if (4 * dst_width == src_width && 4 * dst_height == src_height && - (filtering == kFilterBox || filtering == kFilterNone)) { - // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, filtering); - return; - } - } - if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); - return; - } - if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && - (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); - return; - } - if (filtering && dst_height > src_height) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - if (filtering) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - return; - } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst); -} - -LIBYUV_API -int I420Scale(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering) { - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || - dst_width <= 0 || dst_height <= 0) { - return -1; - } - - ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, - dst_width, dst_height, filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, - dst_stride_u, dst_halfwidth, dst_halfheight, filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, - dst_stride_v, dst_halfwidth, dst_halfheight, filtering); - return 0; -} diff --git a/pkg/encoder/yuv/libyuv/scale.h b/pkg/encoder/yuv/libyuv/scale.h deleted file mode 100644 
index ed0a1983f..000000000 --- a/pkg/encoder/yuv/libyuv/scale.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_SCALE_H_ -#define INCLUDE_LIBYUV_SCALE_H_ - -#include "basic_types.h" - -// Supported filtering. -typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. - kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. -} FilterModeEnum; - -// Scales a YUV 4:2:0 image from the src width and height to the -// dst width and height. -// If filtering is kFilterNone, a simple nearest-neighbor algorithm is -// used. This produces basic (blocky) quality at the fastest speed. -// If filtering is kFilterBilinear, interpolation is used to produce a better -// quality image, at the expense of speed. -// If filtering is kFilterBox, averaging is used to produce ever better -// quality image, at further expense of speed. -// Returns 0 if successful. - -LIBYUV_API -int I420Scale(const uint8_t *src_y, - int src_stride_y, - const uint8_t *src_u, - int src_stride_u, - const uint8_t *src_v, - int src_stride_v, - int src_width, - int src_height, - uint8_t *dst_y, - int dst_stride_y, - uint8_t *dst_u, - int dst_stride_u, - uint8_t *dst_v, - int dst_stride_v, - int dst_width, - int dst_height, - enum FilterMode filtering); - -#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/pkg/encoder/yuv/libyuv/scale_any.c b/pkg/encoder/yuv/libyuv/scale_any.c deleted file mode 100644 index f05e55b6e..000000000 --- a/pkg/encoder/yuv/libyuv/scale_any.c +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright 2015 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale_row.h" - -// Fixed scale down. -// Mask may be non-power of 2, so use MOD -#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } - -// Fixed scale down for odd source width. Used by I420Blend subsampling. -// Since dst_width is (width + 1) / 2, this function scales one less pixel -// and copies the last pixel. 
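/* Illustrative expansion (not libyuv API) of the SDANY pattern above for a
 * 1/2 scaler with a 16-pixel SIMD kernel (FACTOR = 2, BPP = 1, MASK = 15):
 * the largest multiple of 16 goes through the SIMD row function and the
 * leftover pixels through the C fallback, offset by the source pixels the
 * SIMD part already consumed. The function-pointer names are placeholders. */
#include <stddef.h>
#include <stdint.h>

typedef void (*scale_row_fn)(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, int dst_width);

static void scale_row_down2_any_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, int dst_width,
                                       scale_row_fn simd16, scale_row_fn c_fallback) {
  int r = dst_width % 16;   /* remainder, handled by the C version */
  int n = dst_width - r;    /* multiple of 16, handled by SIMD */
  if (n > 0) {
    simd16(src, src_stride, dst, n);
  }
  c_fallback(src + n * 2, src_stride, dst + n, r);  /* 2 source pixels per output */
}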
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ - int n = (dst_width - 1) - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r + 1); \ - } - -#ifdef HAS_SCALEROWDOWN2_SSSE3 - -SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) - -SDANY(ScaleRowDown2Linear_Any_SSSE3, - ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, - 2, - 1, - 15) - -SDANY(ScaleRowDown2Box_Any_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_C, - 2, - 1, - 15) - -SDODD(ScaleRowDown2Box_Odd_SSSE3, - ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 15) - -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -SDANY(ScaleUVRowDown2Box_Any_SSSE3, - ScaleUVRowDown2Box_SSSE3, - ScaleUVRowDown2Box_C, - 2, - 2, - 3) - -#endif -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 - -SDANY(ScaleUVRowDown2Box_Any_AVX2, - ScaleUVRowDown2Box_AVX2, - ScaleUVRowDown2Box_C, - 2, - 2, - 7) - -#endif -#ifdef HAS_SCALEROWDOWN2_AVX2 - -SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) - -SDANY(ScaleRowDown2Linear_Any_AVX2, - ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, - 2, - 1, - 31) - -SDANY(ScaleRowDown2Box_Any_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_C, - 2, - 1, - 31) - -SDODD(ScaleRowDown2Box_Odd_AVX2, - ScaleRowDown2Box_AVX2, - ScaleRowDown2Box_Odd_C, - 2, - 1, - 31) - -#endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 - -SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) - -SDANY(ScaleRowDown4Box_Any_SSSE3, - ScaleRowDown4Box_SSSE3, - ScaleRowDown4Box_C, - 4, - 1, - 7) - -#endif -#ifdef HAS_SCALEROWDOWN4_AVX2 - -SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) - -SDANY(ScaleRowDown4Box_Any_AVX2, - ScaleRowDown4Box_AVX2, - ScaleRowDown4Box_C, - 4, - 1, - 15) - -#endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 - -SDANY(ScaleRowDown34_Any_SSSE3, - ScaleRowDown34_SSSE3, - ScaleRowDown34_C, - 4 / 3, - 1, - 23) - -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, - ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, - 4 / 3, - 1, - 23) - -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, - ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, - 4 / 3, - 1, - 23) - -#endif - -#ifdef HAS_SCALEROWDOWN38_SSSE3 - -SDANY(ScaleRowDown38_Any_SSSE3, - ScaleRowDown38_SSSE3, - ScaleRowDown38_C, - 8 / 3, - 1, - 11) - -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, - ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, - 8 / 3, - 1, - 5) - -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, - ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, - 8 / 3, - 1, - 5) - -#endif - - -#undef SDANY - -// Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8_t* dst_ptr, int dst_width) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ - } - - - -#ifdef SASIMDONLY -// This also works and uses memcpy and SIMD instead of C, but is slower on ARM - -// Add rows box filter scale down. 
Using macro from row_any -#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint16_t dst_temp[32]); \ - SIMD_ALIGNED(uint8_t src_temp[32]); \ - memset(dst_temp, 0, 32 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(src_temp, dst_temp, MASK + 1); \ - memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 -SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) -#endif -#ifdef HAS_SCALEADDROW_AVX2 -SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) -#endif -#undef SAANY - -#else - -// Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } - -#ifdef HAS_SCALEADDROW_SSE2 - -SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) - -#endif -#ifdef HAS_SCALEADDROW_AVX2 - -SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) - -#endif -#undef SAANY - -#endif // SASIMDONLY - -// Scale up horizontally 2 times using linear filter. -#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 1, n); \ - } \ - C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ - } \ - dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \ - } - -// Even the C versions need to be wrapped, because boundary pixels have to -// be handled differently - -SUH2LANY(ScaleRowUp2_Linear_Any_C, - ScaleRowUp2_Linear_C, - ScaleRowUp2_Linear_C, - 0, - uint8_t) - -SUH2LANY(ScaleRowUp2_Linear_16_Any_C, - ScaleRowUp2_Linear_16_C, - ScaleRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - -SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, - ScaleRowUp2_Linear_SSE2, - ScaleRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - -SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, - ScaleRowUp2_Linear_SSSE3, - ScaleRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 - -SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, - ScaleRowUp2_Linear_12_SSSE3, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 - -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, - ScaleRowUp2_Linear_16_SSE2, - ScaleRowUp2_Linear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, - ScaleRowUp2_Linear_AVX2, - ScaleRowUp2_Linear_C, - 31, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, - ScaleRowUp2_Linear_12_AVX2, - ScaleRowUp2_Linear_16_C, - 31, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 - -SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, - ScaleRowUp2_Linear_16_AVX2, - ScaleRowUp2_Linear_16_C, - 15, - uint16_t) - -#endif -#undef SUH2LANY - -// Scale up 2 times using bilinear filter. -// This function produces 2 rows at a time. 
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 1, db - da, n); \ - } \ - C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ - } \ - da[dst_width - 1] = \ - (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ - db[dst_width - 1] = \ - (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ - } - -SU2BLANY(ScaleRowUp2_Bilinear_Any_C, - ScaleRowUp2_Bilinear_C, - ScaleRowUp2_Bilinear_C, - 0, - uint8_t) - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, - ScaleRowUp2_Bilinear_16_C, - ScaleRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, - ScaleRowUp2_Bilinear_SSE2, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 - -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, - ScaleRowUp2_Bilinear_12_SSSE3, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, - ScaleRowUp2_Bilinear_16_SSE2, - ScaleRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, - ScaleRowUp2_Bilinear_SSSE3, - ScaleRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, - ScaleRowUp2_Bilinear_AVX2, - ScaleRowUp2_Bilinear_C, - 31, - uint8_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, - ScaleRowUp2_Bilinear_12_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 - -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, - ScaleRowUp2_Bilinear_16_AVX2, - ScaleRowUp2_Bilinear_16_C, - 15, - uint16_t) - -#endif - -#undef SU2BLANY - -// Scale bi-planar plane up horizontally 2 times using linear filter. 
-#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - dst_ptr[0] = src_ptr[0]; \ - dst_ptr[1] = src_ptr[1]; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(src_ptr, dst_ptr + 2, n); \ - } \ - C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ - } \ - dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ - dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ - } - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, - ScaleUVRowUp2_Linear_C, - ScaleUVRowUp2_Linear_C, - 0, - uint8_t) - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, - ScaleUVRowUp2_Linear_16_C, - ScaleUVRowUp2_Linear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, - ScaleUVRowUp2_Linear_SSSE3, - ScaleUVRowUp2_Linear_C, - 7, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 - -SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, - ScaleUVRowUp2_Linear_AVX2, - ScaleUVRowUp2_Linear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, - ScaleUVRowUp2_Linear_16_SSE41, - ScaleUVRowUp2_Linear_16_C, - 3, - uint16_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 - -SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, - ScaleUVRowUp2_Linear_16_AVX2, - ScaleUVRowUp2_Linear_16_C, - 7, - uint16_t) - -#endif - -#undef SBUH2LANY - -// Scale bi-planar plane up 2 times using bilinear filter. -// This function produces 2 rows at a time. -#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ - void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ - ptrdiff_t dst_stride, int dst_width) { \ - int work_width = (dst_width - 1) & ~1; \ - int r = work_width & MASK; \ - int n = work_width & ~MASK; \ - const PTYPE* sa = src_ptr; \ - const PTYPE* sb = src_ptr + src_stride; \ - PTYPE* da = dst_ptr; \ - PTYPE* db = dst_ptr + dst_stride; \ - da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ - db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ - da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ - db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ - if (work_width > 0) { \ - if (n != 0) { \ - SIMD(sa, sb - sa, da + 2, db - da, n); \ - } \ - C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ - } \ - da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ - sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ - 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ - 2; \ - da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ - sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ - 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ - 2; \ - } - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, - ScaleUVRowUp2_Bilinear_C, - ScaleUVRowUp2_Bilinear_C, - 0, - uint8_t) - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, - ScaleUVRowUp2_Bilinear_16_C, - ScaleUVRowUp2_Bilinear_16_C, - 0, - uint16_t) - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, - ScaleUVRowUp2_Bilinear_SSSE3, - ScaleUVRowUp2_Bilinear_C, - 7, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, - ScaleUVRowUp2_Bilinear_AVX2, - ScaleUVRowUp2_Bilinear_C, - 15, - uint8_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, - ScaleUVRowUp2_Bilinear_16_SSE41, - 
ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 - -SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, - ScaleUVRowUp2_Bilinear_16_AVX2, - ScaleUVRowUp2_Bilinear_16_C, - 7, - uint16_t) - -#endif - -#undef SBU2BLANY diff --git a/pkg/encoder/yuv/libyuv/scale_common.c b/pkg/encoder/yuv/libyuv/scale_common.c deleted file mode 100644 index 17eedd992..000000000 --- a/pkg/encoder/yuv/libyuv/scale_common.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "scale.h" - -#include - -#include "cpu_id.h" -#include "row.h" -#include "scale_row.h" - -#define STATIC_CAST(type, expr) (type)(expr) - -// TODO(fbarchard): make clamp255 preserve negative values. -static __inline int32_t clamp255(int32_t v) { - return (-(v >= 255) | v) & 255; -} - -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 32768 = 9 bits -// 16384 = 10 bits -// 4096 = 12 bits -// 256 = 16 bits -// TODO(fbarchard): change scale to bits -#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) - -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - -// CPU agnostic row functions -void ScaleRowDown2_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Linear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - dst_width -= 1; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst += 1; - s += 2; - t += 2; - } - dst[0] = (s[0] + t[0] + 1) >> 1; -} - -void ScaleRowDown4_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -void 
ScaleRowDown4Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - intptr_t stride = src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + - src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + - src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + - src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + - src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + - src_ptr[stride * 3 + 7] + 8) >> - 4; - dst += 2; - src_ptr += 8; - } - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + - src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + - src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + - src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + - src_ptr[stride * 3 + 3] + 8) >> - 4; - } -} - -void ScaleRowDown34_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } -} - -// Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } -} - -// Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - int x; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (x = 0; x < dst_width; x += 3) { - uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// v dst_ptr at here v stop at here -// X O X X O X X O X X O X X O X -// ^ src_ptr at here -void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 
0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Sample position: (O is src sample position, X is dst sample position) -// -// src_ptr at here -// X v X X X X X X X X X -// O O O O O -// X X X X X X X X X X -// ^ dst_ptr at here ^ stop at here -// X X X X X X X X X X -// O O O O O -// X X X X X X X X X X -void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - uint8_t *d = dst_ptr; - uint8_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// Only suitable for at most 14 bit range. -void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; - dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; - } -} - -// Only suitable for at most 12bit range. -void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t *s = src_ptr; - const uint16_t *t = src_ptr + src_stride; - uint16_t *d = dst_ptr; - uint16_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[2 * x + 0] = - (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; - d[2 * x + 1] = - (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 0] = - (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; - e[2 * x + 1] = - (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; - } -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -#else -// Intel uses 7 bit math with rounding. 
-#define BLENDER(a, b, f) \ - (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) -#endif - -void ScaleFilterCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -void ScaleFilterCols64_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x32, - int dx) { - int64_t x = (int64_t) (x32); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int64_t xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -#undef BLENDER - -// Same as 8 bit arm blender but return is cast to uint16_t -#define BLENDER(a, b, f) \ - (uint16_t)( \ - (int)(a) + \ - (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) -#undef BLENDER - -void ScaleRowDown38_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width) { - int x; - (void) src_stride; - assert(dst_width % 3 == 0); - for (x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stride = src_stride; - int i; - assert((dst_width % 3 == 0) && (dst_width > 0)); - for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + - src_ptr[stride + 1] + src_ptr[stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + - src_ptr[stride + 4] + src_ptr[stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = - (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> - 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width) { - int x; - 
assert(src_width > 0); - for (x = 0; x < src_width - 1; x += 2) { - dst_ptr[0] += src_ptr[0]; - dst_ptr[1] += src_ptr[1]; - src_ptr += 2; - dst_ptr += 2; - } - if (src_width & 1) { - dst_ptr[0] += src_ptr[0]; - } -} - -// UV scale row functions -// same as ARGB but 2 channels - -void ScaleUVRowDown2_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = src_uv[2]; // Store the 2nd UV - dst_uv[1] = src_uv[3]; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - (void) src_stride; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; - dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDown2Box_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width) { - int x; - for (x = 0; x < dst_width; ++x) { - dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; - dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; - src_uv += 4; - dst_uv += 2; - } -} - -void ScaleUVRowDownEven_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t *dst_uv, - int dst_width) { - const uint16_t *src = (const uint16_t *) (src_uv); - uint16_t *dst = (uint16_t *) (dst_uv); - (void) src_stride; - int x; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. 
-void ScaleColsUp2_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - int j; - (void) x; - (void) dx; - for (j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint8_t *s = src_ptr; - const uint8_t *t = src_ptr + src_stride; - uint8_t *d = dst_ptr; - uint8_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - dst_ptr[4 * x + 0] = - (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; - dst_ptr[4 * x + 1] = - (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; - dst_ptr[4 * x + 2] = - (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; - dst_ptr[4 * x + 3] = - (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; - } -} - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - const uint16_t *s = src_ptr; - const uint16_t *t = src_ptr + src_stride; - uint16_t *d = dst_ptr; - uint16_t *e = dst_ptr + dst_stride; - int src_width = dst_width >> 1; - int x; - assert((dst_width % 2 == 0) && (dst_width >= 0)); - for (x = 0; x < src_width; ++x) { - d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 1 + 8) >> - 4; - d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 1 + 8) >> - 4; - d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + - t[2 * x + 2] * 3 + 8) >> - 4; - d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + - t[2 
* x + 3] * 3 + 8) >> - 4; - e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + - t[2 * x + 2] * 3 + 8) >> - 4; - e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + - t[2 * x + 3] * 3 + 8) >> - 4; - e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + - t[2 * x + 2] * 9 + 8) >> - 4; - e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + - t[2 * x + 3] * 9 + 8) >> - 4; - } -} - -// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 -#define BLENDERC(a, b, f, s) \ - (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -void ScaleUVFilterCols_C(uint8_t *dst_uv, - const uint8_t *src_uv, - int dst_width, - int x, - int dx) { - const uint16_t *src = (const uint16_t *) (src_uv); - uint16_t *dst = (uint16_t *) (dst_uv); - int j; - for (j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint16_t a = src[xi]; - uint16_t b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - -#undef BLENDER1 -#undef BLENDERC -#undef BLENDER - -// Scale plane vertically with bilinear interpolation. -void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_argb, - uint8_t *dst_argb, - int x, - int y, - int dy, - int bpp, // bytes per pixel. 4 for ARGB. - enum FilterMode filtering) { - // TODO(fbarchard): Allow higher bpp. - int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t *dst_argb, const uint8_t *src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - int j; - assert(bpp >= 1 && bpp <= 4); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(dst_width_bytes, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif - - - for (j = 0; j < dst_height; ++j) { - int yi; - int yf; - if (y > max_y) { - y = max_y; - } - yi = y >> 16; - yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, - dst_width_bytes, yf); - dst_argb += dst_stride; - y += dy; - } -} - -// Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering) { - if (src_width < 0) { - src_width = -src_width; - } - if (src_height < 0) { - src_height = -src_height; - } - if (filtering == kFilterBox) { - // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. 
- if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { - filtering = kFilterBilinear; - } - } - if (filtering == kFilterBilinear) { - if (src_height == 1) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. - if (dst_height == src_height || dst_height * 3 == src_height) { - filtering = kFilterLinear; - } - // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to - // avoid reading 2 pixels horizontally that causes memory exception. - if (src_width == 1) { - filtering = kFilterNone; - } - } - if (filtering == kFilterLinear) { - if (src_width == 1) { - filtering = kFilterNone; - } - // TODO(fbarchard): Detect any odd scale factor and reduce to None. - if (dst_width == src_width || dst_width * 3 == src_width) { - filtering = kFilterNone; - } - } - return filtering; -} - -#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int *x, - int *y, - int *dx, - int *dy) { - assert(x != NULL); - assert(y != NULL); - assert(dx != NULL); - assert(dy != NULL); - assert(src_width != 0); - assert(src_height != 0); - assert(dst_width > 0); - assert(dst_height > 0); - // Check for 1 pixel and avoid FixedDiv overflow. - if (dst_width == 1 && src_width >= 32768) { - dst_width = src_width; - } - if (dst_height == 1 && src_height >= 32768) { - dst_height = src_height; - } - if (filtering == kFilterBox) { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = 0; - *y = 0; - } else if (filtering == kFilterBilinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_width > 1 && dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); - *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_height > 1 && dst_height > 1) { - *dy = FixedDiv1(src_height, dst_height); - *y = 0; - } - } else if (filtering == kFilterLinear) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - *dx = FixedDiv(Abs(src_width), dst_width); - *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. - } else if (src_width > 1 && dst_width > 1) { - *dx = FixedDiv1(Abs(src_width), dst_width); - *x = 0; - } - *dy = FixedDiv(src_height, dst_height); - *y = *dy >> 1; - } else { - // Scale step for point sampling duplicates all pixels equally. - *dx = FixedDiv(Abs(src_width), dst_width); - *dy = FixedDiv(src_height, dst_height); - *x = CENTERSTART(*dx, 0); - *y = CENTERSTART(*dy, 0); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - *x += (dst_width - 1) * *dx; - *dx = -*dx; - // src_width = -src_width; // Caller must do this. - } -} - -#undef CENTERSTART diff --git a/pkg/encoder/yuv/libyuv/scale_gcc.c b/pkg/encoder/yuv/libyuv/scale_gcc.c deleted file mode 100644 index 716d6cfdb..000000000 --- a/pkg/encoder/yuv/libyuv/scale_gcc.c +++ /dev/null @@ -1,2651 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "row.h" -#include "scale_row.h" - -// This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) - -// Offsets for source bytes 0 to 9 -static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Offsets for source bytes 0 to 10 -static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, - 8, 9, 9, 10, 10, 11, 12, 13}; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, - 10, 11, 12, 13, 13, 14, 14, 15}; - -// Coefficients for source bytes 0 to 10 -static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; - -// Coefficients for source bytes 10 to 21 -static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; - -// Coefficients for source bytes 21 to 31 -static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; - -// Coefficients for source bytes 21 to 31 -static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; - -static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, - 6, 8, 11, 14, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 0,1,2 -static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; - -// Arrange words 0,3,6 into 3,4,5 -static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, - 6, 7, 12, 13, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x3 and 2x3 -static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, - 65536 / 9, 65536 / 6, 0, 0}; - -// Arrange first value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, - 11, 128, 14, 128, 128, 128, 128, 128}; - -// Arrange second value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, - 12, 128, 15, 128, 128, 128, 128, 128}; - -// Arrange third value for pixels 0,1,2,3,4,5 -static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, - 13, 128, 128, 128, 128, 128, 128, 128}; - -// Scaling values for boxes of 3x2 and 2x2 -static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, - 65536 / 3, 65536 / 2, 0, 0}; - -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt - -void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - // 16 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown2_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile(LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", 
"xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#endif // HAS_SCALEROWDOWN2_AVX2 - -void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t) (src_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#ifdef HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown4_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "vpcmpeqb 
%%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} - -void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (src_stride * 3)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEROWDOWN4_AVX2 - -void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 
\n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - (void) src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 
- "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile(LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} - -static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, - 10, 11, 8, 9, 14, 15, 12, 13}; - -static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 - -void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa 
%%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 - -void ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - LABELALIGN - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa 
%%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 - -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 - -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 
2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 - -void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) 
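// Editorial sketch, not from the upstream libyuv sources being removed here.
// Every ScaleRowUp2_Linear_* row above (SSE2/SSSE3, 8/12/16-bit) evaluates the
// same per-sample filter: each output pair weights the "near" and "far" source
// samples 3:1, adds 2 for rounding, and shifts right by 2 (the "3*near+far+2"
// comments). A minimal scalar version of that arithmetic for the 8-bit case;
// the function name is illustrative, and it assumes the caller provides one
// extra readable source sample on the right (edge cases are handled by the
// *_Any_C wrappers in libyuv, not here).
#include <stdint.h>

static void scale_row_up2_linear_scalar(const uint8_t *src, uint8_t *dst,
                                        int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    // Left output leans on src[x], right output leans on src[x + 1].
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}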
- "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 - -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - 
"paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm5,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 - -void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 - -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 
9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_AVX2 - -void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2 - -void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF - "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - 
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2 - -void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) - "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) - - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 - - "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 - "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 - - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far - "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,32(%1) \n" - - "lea 0x20(%0),%0 \n" - "lea 0x40(%1),%1 \n" // 16 sample to 32 sample - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" 
- : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2 - -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 \n" - "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) - - "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) - "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) - "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far - "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near - "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) - - "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) - "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1) \n" // store above - - "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) - "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) - "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 - "vmovdqu %%ymm0,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2 - -void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - 
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2 - -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) - "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) - "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vpshufd 
$0b11011000,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -// Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#ifdef HAS_SCALEADDROW_AVX2 - -// Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -#endif // HAS_SCALEADDROW_AVX2 - -// Constant for making pixels signed to avoid pmaddubsw -// saturation. -static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - -// Constant for making pixels unsigned and adding .5 for rounding. -static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, - 0x4040, 0x4040, 0x4040, 0x4040}; - -// Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. 
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx) { - (void) x; - (void) dx; - asm volatile(LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -// Divide num - 1 by div - 1 and return as 16.16 fixed point result. -int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx"); - return num; -} - -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ - defined(HAS_SCALEUVROWDOWN2BOX_AVX2) - -// Shuffle table for splitting UV into upper and lower part of register. 
-static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; -static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, - 6u, 14u, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}; -#endif - -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 - -#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 - -void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero - "vbroadcastf128 %4,%%ymm1 \n" // split shuffler - "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 - "lea 0x20(%0),%0 \n" - "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv - "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv - "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" // 8 UV - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 - -static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, - 3, 1, 3, 1, 1, 3, 1, 3}; - -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw 
%%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 - -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $15,%%ymm4,%%ymm4 
\n" - "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vbroadcastf128 %3,%%ymm3 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 - -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrlw $15,%%ymm6,%%ymm6 \n" - "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vbroadcastf128 %5,%%ymm7 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" - "vmovdqu 2(%0),%%xmm1 \n" - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" - "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) - - "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF - "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" - "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" - "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) - - // ymm0 ymm1 - // ymm2 ymm3 - - "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu 
%%ymm5,(%1,%4) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 uv to 16 uv - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 - -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 - -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 
6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 - -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - - "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - - "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) - "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) - - "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 - -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd 
%%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) - "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) - "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) - "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) - "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) - "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) - "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) - "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) - "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) - "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) - "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) - "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) - "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) - "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t) (src_stride)), // %3 - "r"((intptr_t) (dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} - -#endif - -#endif // defined(__x86_64__) || defined(__i386__) diff --git a/pkg/encoder/yuv/libyuv/scale_row.h b/pkg/encoder/yuv/libyuv/scale_row.h deleted file mode 100644 index 16389cdcf..000000000 --- a/pkg/encoder/yuv/libyuv/scale_row.h +++ /dev/null @@ -1,768 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ -#define INCLUDE_LIBYUV_SCALE_ROW_H_ - -#include "basic_types.h" -#include "scale.h" - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__native_client__) && defined(__x86_64__)) || \ - (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) -#define LIBYUV_DISABLE_X86 -#endif -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif -// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_NEON) -#define LIBYUV_DISABLE_NEON -#endif -#if __has_feature(memory_sanitizer) && !defined(LIBYUV_DISABLE_X86) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_FIXEDDIV1_X86 -#define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 -#endif - -// The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_SSE2 -#define HAS_SCALEROWUP2_LINEAR_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_16_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2_LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -#endif - -// The following are available for gcc/clang x86 platforms, but -// require clang 3.4 or gcc 4.7. -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -#define HAS_SCALEUVROWDOWN2BOX_AVX2 -#define HAS_SCALEROWUP2_LINEAR_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_AVX2 -#define HAS_SCALEROWUP2_LINEAR_12_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_12_AVX2 -#define HAS_SCALEROWUP2_LINEAR_16_AVX2 -#define HAS_SCALEROWUP2_BILINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2_LINEAR_AVX2 -#define HAS_SCALEUVROWUP2_BILINEAR_AVX2 -#define HAS_SCALEUVROWUP2_LINEAR_16_AVX2 -#define HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 -#endif - -// The following are available on all x86 platforms, but -// require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(GCC_HAS_AVX2)) -#define HAS_SCALEADDROW_AVX2 -#define HAS_SCALEROWDOWN2_AVX2 -#define HAS_SCALEROWDOWN4_AVX2 -#endif - -// Scale ARGB vertically with bilinear interpolation. 
-void ScalePlaneVertical(int src_height, - int dst_width, - int dst_height, - int src_stride, - int dst_stride, - const uint8_t *src_argb, - uint8_t *dst_argb, - int x, - int y, - int dy, - int bpp, - enum FilterMode filtering); - -// Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering); - -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_X86(int num, int div); - -int FixedDiv1_X86(int num, int div); - -#ifdef HAS_FIXEDDIV_X86 -#define FixedDiv FixedDiv_X86 -#define FixedDiv1 FixedDiv1_X86 -#endif - -// Compute slope values for stepping. -void ScaleSlope(int src_width, - int src_height, - int dst_width, - int dst_height, - enum FilterMode filtering, - int *x, - int *y, - int *dx, - int *dy); - -void ScaleRowDown2_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Linear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown2Box_Odd_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown4_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown4Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown34_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown34_0_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width); - -void ScaleRowDown34_1_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *d, - int dst_width); - -void ScaleRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleColsUp2_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int, - int); - -void ScaleFilterCols_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleFilterCols64_C(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x32, - int dx); - -void ScaleRowDown38_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst, - int dst_width); - -void ScaleRowDown38_3_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int 
dst_width); - -void ScaleAddRow_C(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleUVRowDown2_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Linear_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDownEven_C(const uint8_t *src_uv, - ptrdiff_t src_stride, - int src_stepx, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowUp2_Linear_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_C(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -// Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_1_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_0_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_3_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Linear_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void 
ScaleRowUp2_Bilinear_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleRowDown2_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t *src_ptr, 
- ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Linear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown2Box_Odd_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown4Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleAddRow_SSE2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleAddRow_AVX2(const uint8_t *src_ptr, uint16_t *dst_ptr, int src_width); - -void ScaleAddRow_Any_SSE2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width); - -void ScaleAddRow_Any_AVX2(const uint8_t *src_ptr, - uint16_t *dst_ptr, - int src_width); - -void ScaleFilterCols_SSSE3(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -void ScaleColsUp2_SSE2(uint8_t *dst_ptr, - const uint8_t *src_ptr, - int dst_width, - int x, - int dx); - -// UV Row functions -void ScaleUVRowDown2Box_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_uv, - int dst_width); - -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowDown2Box_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int 
dst_width); - -void ScaleUVRowUp2_Linear_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t *src_ptr, - uint8_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t *src_ptr, - ptrdiff_t src_stride, - uint8_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t *src_ptr, - uint16_t *dst_ptr, - int dst_width); - -void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, - int dst_width); - -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/libyuv/version.h b/pkg/encoder/yuv/libyuv/version.h deleted file mode 100644 index d45ef09d6..000000000 --- a/pkg/encoder/yuv/libyuv/version.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION_H_ - -#define LIBYUV_VERSION 1875 - -#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/pkg/encoder/yuv/libyuv/video_common.c b/pkg/encoder/yuv/libyuv/video_common.c deleted file mode 100644 index e492402e8..000000000 --- a/pkg/encoder/yuv/libyuv/video_common.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "video_common.h" - -struct FourCCAliasEntry { - uint32_t alias; - uint32_t canonical; -}; - -#define NUM_ALIASES 18 -static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW}, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 -}; - -LIBYUV_API -uint32_t CanonicalFourCC(uint32_t fourcc) { - int i; - for (i = 0; i < NUM_ALIASES; ++i) { - if (kFourCCAliases[i].alias == fourcc) { - return kFourCCAliases[i].canonical; - } - } - // Not an alias, so return it as-is. - return fourcc; -} diff --git a/pkg/encoder/yuv/libyuv/video_common.h b/pkg/encoder/yuv/libyuv/video_common.h deleted file mode 100644 index e2aacf44c..000000000 --- a/pkg/encoder/yuv/libyuv/video_common.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Common definitions for video, including fourcc and VideoFormat. - -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ -#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ - -#include "basic_types.h" - -////////////////////////////////////////////////////////////////////////////// -// Definition of FourCC codes -////////////////////////////////////////////////////////////////////////////// - -// Convert four characters to a FourCC code. -// Needs to be a macro otherwise the OS X compiler complains when the kFormat* -// constants are used in a switch. -#ifdef __cplusplus -#define FOURCC(a, b, c, d) \ - ((static_cast(a)) | (static_cast(b) << 8) | \ - (static_cast(c) << 16) | /* NOLINT */ \ - (static_cast(d) << 24)) /* NOLINT */ -#else -#define FOURCC(a, b, c, d) \ - (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ - ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ -#endif - -// Some pages discussing FourCC codes: -// http://www.fourcc.org/yuv.php -// http://v4l2spec.bytesex.org/spec/book1.htm -// http://developer.apple.com/quicktime/icefloe/dispatch020.html -// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 -// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt - -// FourCC codes grouped according to implementation efficiency. -// Primary formats should convert in 1 efficient step. -// Secondary formats are converted in 2 steps. -// Auxilliary formats call primary converters. -enum FourCC { - // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. 
- FOURCC_I420 = FOURCC('I', '4', '2', '0'), - FOURCC_I422 = FOURCC('I', '4', '2', '2'), - FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I400 = FOURCC('I', '4', '0', '0'), - FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), - FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), - FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), - FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), - FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 - FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 - - // 1 Secondary YUV format: row biplanar. deprecated. - FOURCC_M420 = FOURCC('M', '4', '2', '0'), - - // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp - FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), - FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), - FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), - FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. - FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit - FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. - FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit - FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), - FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), - FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. - FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. - FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - - // 1 Primary Compressed YUV format. - FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - - // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. - FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), - FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), - FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), - FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. - FOURCC_J420 = - FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = - FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = - FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = - FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc - FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc - FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc - FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc - FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc - FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc - FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc - FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc - FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc - FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 - FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 - FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 - FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 - FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 - FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 - FOURCC_P010 = FOURCC('P', '0', '1', '0'), - FOURCC_P210 = FOURCC('P', '2', '1', '0'), - - // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. - FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. - FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. - FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. 
- FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. - FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. - FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. - FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. - FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. - FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. - FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. - FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. - FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. - FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB - FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB - FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. - FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. - FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - - // deprecated formats. Not supported, but defined for backward compatibility. - FOURCC_I411 = FOURCC('I', '4', '1', '1'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - FOURCC_H264 = FOURCC('H', '2', '6', '4'), - - // Match any fourcc. - FOURCC_ANY = -1, -}; - -enum FourCCBpp { - // Canonical fourcc codes used in our code. - FOURCC_BPP_I420 = 12, - FOURCC_BPP_I422 = 16, - FOURCC_BPP_I444 = 24, - FOURCC_BPP_I411 = 12, - FOURCC_BPP_I400 = 8, - FOURCC_BPP_NV21 = 12, - FOURCC_BPP_NV12 = 12, - FOURCC_BPP_YUY2 = 16, - FOURCC_BPP_UYVY = 16, - FOURCC_BPP_M420 = 12, // deprecated - FOURCC_BPP_Q420 = 12, - FOURCC_BPP_ARGB = 32, - FOURCC_BPP_BGRA = 32, - FOURCC_BPP_ABGR = 32, - FOURCC_BPP_RGBA = 32, - FOURCC_BPP_AR30 = 32, - FOURCC_BPP_AB30 = 32, - FOURCC_BPP_AR64 = 64, - FOURCC_BPP_AB64 = 64, - FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, - FOURCC_BPP_RGBP = 16, - FOURCC_BPP_RGBO = 16, - FOURCC_BPP_R444 = 16, - FOURCC_BPP_RGGB = 8, - FOURCC_BPP_BGGR = 8, - FOURCC_BPP_GRBG = 8, - FOURCC_BPP_GBRG = 8, - FOURCC_BPP_YV12 = 12, - FOURCC_BPP_YV16 = 16, - FOURCC_BPP_YV24 = 24, - FOURCC_BPP_YU12 = 12, - FOURCC_BPP_J420 = 12, - FOURCC_BPP_J400 = 8, - FOURCC_BPP_H420 = 12, - FOURCC_BPP_H422 = 16, - FOURCC_BPP_I010 = 15, - FOURCC_BPP_I210 = 20, - FOURCC_BPP_H010 = 15, - FOURCC_BPP_H210 = 20, - FOURCC_BPP_P010 = 15, - FOURCC_BPP_P210 = 20, - FOURCC_BPP_MJPG = 0, // 0 means unknown. - FOURCC_BPP_H264 = 0, - FOURCC_BPP_IYUV = 12, - FOURCC_BPP_YU16 = 16, - FOURCC_BPP_YU24 = 24, - FOURCC_BPP_YUYV = 16, - FOURCC_BPP_YUVS = 16, - FOURCC_BPP_HDYC = 16, - FOURCC_BPP_2VUY = 16, - FOURCC_BPP_JPEG = 1, - FOURCC_BPP_DMB1 = 1, - FOURCC_BPP_BA81 = 8, - FOURCC_BPP_RGB3 = 24, - FOURCC_BPP_BGR3 = 24, - FOURCC_BPP_CM32 = 32, - FOURCC_BPP_CM24 = 24, - - // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. -}; - -// Converts fourcc aliases into canonical ones. 
-LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); - -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ \ No newline at end of file diff --git a/pkg/encoder/yuv/yuv_test.go b/pkg/encoder/yuv/yuv_test.go index 6b67c29f0..3f07aa69d 100644 --- a/pkg/encoder/yuv/yuv_test.go +++ b/pkg/encoder/yuv/yuv_test.go @@ -115,6 +115,9 @@ func TestYuvPredefined(t *testing.T) { frame := RawFrame{Data: im, Stride: 32, W: 32, H: 32} a := pc.Process(frame, 0, PixFmt(libyuv.FourccAbgr)) + v := libyuv.Version() + t.Logf("%v", v) + if len(a) != len(should) { t.Fatalf("diffrent size a: %v, o: %v", len(a), len(should)) }
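A side note on the length comparison in the test hunk above: assuming the converter still emits planar 4:2:0 output (12 bits per pixel, matching FOURCC_BPP_I420 in the removed video_common.h), the expected buffer length for a WxH input frame is W*H*3/2 bytes. The short Go sketch below only illustrates that arithmetic; the i420Size helper is hypothetical and not part of the package.

package main

import "fmt"

// i420Size returns the byte length of an I420 (YUV 4:2:0) frame:
// one full-resolution Y plane plus two chroma planes subsampled 2x2.
// The helper name is illustrative only; it does not exist in the repository.
func i420Size(w, h int) int {
	y := w * h                         // luma: 1 byte per pixel
	c := ((w + 1) / 2) * ((h + 1) / 2) // each chroma plane after 2x2 subsampling
	return y + 2*c
}

func main() {
	// The updated test converts a 32x32 ABGR frame, so a check such as
	// len(a) != len(should) would be comparing against 1536 bytes here.
	fmt.Println(i420Size(32, 32)) // 1536 = 32*32*3/2
}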