author    Elliott Hughes <enh@google.com>  2020-12-02 18:09:57 -0800
committer Elliott Hughes <enh@google.com>  2020-12-02 18:24:38 -0800
commit    98e581f8227b9846b7adc92c0c63f5ed2384ff4b (patch)
tree      26effec72dbace7671158aeddc9fde1e3e9cd1a9 /simd
parent    ff78ee5a324a7d37e0bfffd6152ea37056f29931 (diff)
parent    d5148db386ceb4a608058320071cbed890bd6ad2 (diff)

Switch to chromium upstream.

Bug: https://issuetracker.google.com/135180511
Test: treehugger
Change-Id: I0c78ec9b07debdb501a96df0d6cb2f9a24b5fc84
Diffstat (limited to 'simd')
-rwxr-xr-x  simd/CMakeLists.txt | 8
-rw-r--r--  simd/arm/arm/jccolext-neon.c | 145
-rw-r--r--  simd/arm/arm/jsimd.c (renamed from simd/arm/jsimd.c) | 248
-rw-r--r--  simd/arm/arm/jsimd_neon.S | 499
-rw-r--r--  simd/arm/arm64/jccolext-neon.c | 312
-rw-r--r--  simd/arm/arm64/jsimd.c (renamed from simd/arm64/jsimd.c) | 264
-rw-r--r--  simd/arm/arm64/jsimd_neon.S | 538
-rw-r--r--  simd/arm/common/jccolor-neon.c | 158
-rw-r--r--  simd/arm/common/jcgray-neon.c | 118
-rw-r--r--  simd/arm/common/jcgryext-neon.c | 107
-rw-r--r--  simd/arm/common/jcsample-neon.c | 191
-rw-r--r--  simd/arm/common/jdcolext-neon.c | 330
-rw-r--r--  simd/arm/common/jdcolor-neon.c | 134
-rw-r--r--  simd/arm/common/jdmerge-neon.c | 138
-rw-r--r--  simd/arm/common/jdmrgext-neon.c | 607
-rw-r--r--  simd/arm/common/jdsample-neon.c | 557
-rw-r--r--  simd/arm/common/jfdctfst-neon.c | 211
-rw-r--r--  simd/arm/common/jfdctint-neon.c | 371
-rw-r--r--  simd/arm/common/jidctfst-neon.c | 454
-rw-r--r--  simd/arm/common/jidctint-neon.c | 758
-rw-r--r--  simd/arm/common/jidctred-neon.c | 469
-rw-r--r--  simd/arm/common/jquanti-neon.c | 190
-rw-r--r--  simd/arm/jsimd_neon.S | 2878
-rw-r--r--  simd/arm64/jsimd_neon.S | 3432
-rw-r--r--  simd/i386/jccolext-avx2.asm | 6
-rw-r--r--  simd/i386/jccolext-mmx.asm | 10
-rw-r--r--  simd/i386/jccolext-sse2.asm | 6
-rw-r--r--  simd/i386/jccolor-avx2.asm | 2
-rw-r--r--  simd/i386/jccolor-mmx.asm | 2
-rw-r--r--  simd/i386/jccolor-sse2.asm | 2
-rw-r--r--  simd/i386/jcgray-avx2.asm | 2
-rw-r--r--  simd/i386/jcgray-mmx.asm | 2
-rw-r--r--  simd/i386/jcgray-sse2.asm | 2
-rw-r--r--  simd/i386/jcgryext-avx2.asm | 6
-rw-r--r--  simd/i386/jcgryext-mmx.asm | 10
-rw-r--r--  simd/i386/jcgryext-sse2.asm | 6
-rw-r--r--  simd/i386/jchuff-sse2.asm | 53
-rw-r--r--  simd/i386/jcphuff-sse2.asm | 2
-rw-r--r--  simd/i386/jcsample-avx2.asm | 2
-rw-r--r--  simd/i386/jcsample-mmx.asm | 2
-rw-r--r--  simd/i386/jcsample-sse2.asm | 2
-rw-r--r--  simd/i386/jdcolext-avx2.asm | 6
-rw-r--r--  simd/i386/jdcolext-mmx.asm | 10
-rw-r--r--  simd/i386/jdcolext-sse2.asm | 6
-rw-r--r--  simd/i386/jdcolor-avx2.asm | 2
-rw-r--r--  simd/i386/jdcolor-mmx.asm | 2
-rw-r--r--  simd/i386/jdcolor-sse2.asm | 2
-rw-r--r--  simd/i386/jdmerge-avx2.asm | 2
-rw-r--r--  simd/i386/jdmerge-mmx.asm | 2
-rw-r--r--  simd/i386/jdmerge-sse2.asm | 2
-rw-r--r--  simd/i386/jdmrgext-avx2.asm | 6
-rw-r--r--  simd/i386/jdmrgext-mmx.asm | 10
-rw-r--r--  simd/i386/jdmrgext-sse2.asm | 6
-rw-r--r--  simd/i386/jdsample-avx2.asm | 2
-rw-r--r--  simd/i386/jdsample-mmx.asm | 2
-rw-r--r--  simd/i386/jdsample-sse2.asm | 2
-rw-r--r--  simd/i386/jfdctflt-3dn.asm | 2
-rw-r--r--  simd/i386/jfdctflt-sse.asm | 2
-rw-r--r--  simd/i386/jfdctfst-mmx.asm | 2
-rw-r--r--  simd/i386/jfdctfst-sse2.asm | 2
-rw-r--r--  simd/i386/jfdctint-avx2.asm | 2
-rw-r--r--  simd/i386/jfdctint-mmx.asm | 2
-rw-r--r--  simd/i386/jfdctint-sse2.asm | 2
-rw-r--r--  simd/i386/jidctflt-3dn.asm | 34
-rw-r--r--  simd/i386/jidctflt-sse.asm | 6
-rw-r--r--  simd/i386/jidctflt-sse2.asm | 6
-rw-r--r--  simd/i386/jidctfst-mmx.asm | 6
-rw-r--r--  simd/i386/jidctfst-sse2.asm | 6
-rw-r--r--  simd/i386/jidctint-avx2.asm | 6
-rw-r--r--  simd/i386/jidctint-mmx.asm | 6
-rw-r--r--  simd/i386/jidctint-sse2.asm | 6
-rw-r--r--  simd/i386/jidctred-mmx.asm | 18
-rw-r--r--  simd/i386/jidctred-sse2.asm | 10
-rw-r--r--  simd/i386/jquant-3dn.asm | 2
-rw-r--r--  simd/i386/jquant-mmx.asm | 2
-rw-r--r--  simd/i386/jquant-sse.asm | 2
-rw-r--r--  simd/i386/jquantf-sse2.asm | 2
-rw-r--r--  simd/i386/jquanti-avx2.asm | 2
-rw-r--r--  simd/i386/jquanti-sse2.asm | 2
-rw-r--r--  simd/i386/jsimd.c | 12
-rw-r--r--  simd/i386/jsimdcpu.asm | 2
-rw-r--r--  simd/jsimd.h | 93
-rw-r--r--  simd/loongson/jccolext-mmi.c | 483
-rw-r--r--  simd/loongson/jccolor-mmi.c | 148
-rw-r--r--  simd/loongson/jcsample-mmi.c | 100
-rw-r--r--  simd/loongson/jcsample.h | 28
-rw-r--r--  simd/loongson/jdcolext-mmi.c | 424
-rw-r--r--  simd/loongson/jdcolor-mmi.c | 139
-rw-r--r--  simd/loongson/jdsample-mmi.c | 245
-rw-r--r--  simd/loongson/jfdctint-mmi.c | 398
-rw-r--r--  simd/loongson/jidctint-mmi.c | 571
-rw-r--r--  simd/loongson/jquanti-mmi.c | 130
-rw-r--r--  simd/loongson/jsimd.c | 610
-rw-r--r--  simd/loongson/jsimd_mmi.h | 57
-rw-r--r--  simd/loongson/loongson-mmintrin.h | 1324
-rw-r--r--  simd/mips/jsimd.c | 1123
-rw-r--r--  simd/mips/jsimd_dspr2.S | 4479
-rw-r--r--  simd/mips/jsimd_dspr2_asm.h | 292
-rw-r--r--  simd/nasm/jcolsamp.inc | 2
-rw-r--r--  simd/nasm/jdct.inc | 2
-rw-r--r--  simd/nasm/jpeg_nbits_table.inc | 4097
-rw-r--r--  simd/nasm/jsimdext.inc | 31
-rw-r--r--  simd/powerpc/jccolext-altivec.c | 269
-rw-r--r--  simd/powerpc/jccolor-altivec.c | 116
-rw-r--r--  simd/powerpc/jcgray-altivec.c | 111
-rw-r--r--  simd/powerpc/jcgryext-altivec.c | 228
-rw-r--r--  simd/powerpc/jcsample-altivec.c | 159
-rw-r--r--  simd/powerpc/jcsample.h | 28
-rw-r--r--  simd/powerpc/jdcolext-altivec.c | 276
-rw-r--r--  simd/powerpc/jdcolor-altivec.c | 106
-rw-r--r--  simd/powerpc/jdmerge-altivec.c | 130
-rw-r--r--  simd/powerpc/jdmrgext-altivec.c | 329
-rw-r--r--  simd/powerpc/jdsample-altivec.c | 400
-rw-r--r--  simd/powerpc/jfdctfst-altivec.c | 154
-rw-r--r--  simd/powerpc/jfdctint-altivec.c | 258
-rw-r--r--  simd/powerpc/jidctfst-altivec.c | 255
-rw-r--r--  simd/powerpc/jidctint-altivec.c | 357
-rw-r--r--  simd/powerpc/jquanti-altivec.c | 250
-rw-r--r--  simd/powerpc/jsimd.c | 872
-rw-r--r--  simd/powerpc/jsimd_altivec.h | 98
-rw-r--r--  simd/x86_64/jccolext-avx2.asm | 6
-rw-r--r--  simd/x86_64/jccolext-sse2.asm | 6
-rw-r--r--  simd/x86_64/jccolor-avx2.asm | 2
-rw-r--r--  simd/x86_64/jccolor-sse2.asm | 2
-rw-r--r--  simd/x86_64/jcgray-avx2.asm | 2
-rw-r--r--  simd/x86_64/jcgray-sse2.asm | 2
-rw-r--r--  simd/x86_64/jcgryext-avx2.asm | 6
-rw-r--r--  simd/x86_64/jcgryext-sse2.asm | 6
-rw-r--r--  simd/x86_64/jchuff-sse2.asm | 13
-rw-r--r--  simd/x86_64/jcphuff-sse2.asm | 2
-rw-r--r--  simd/x86_64/jcsample-avx2.asm | 2
-rw-r--r--  simd/x86_64/jcsample-sse2.asm | 2
-rw-r--r--  simd/x86_64/jdcolext-avx2.asm | 6
-rw-r--r--  simd/x86_64/jdcolext-sse2.asm | 6
-rw-r--r--  simd/x86_64/jdcolor-avx2.asm | 2
-rw-r--r--  simd/x86_64/jdcolor-sse2.asm | 2
-rw-r--r--  simd/x86_64/jdmerge-avx2.asm | 2
-rw-r--r--  simd/x86_64/jdmerge-sse2.asm | 2
-rw-r--r--  simd/x86_64/jdmrgext-avx2.asm | 6
-rw-r--r--  simd/x86_64/jdmrgext-sse2.asm | 6
-rw-r--r--  simd/x86_64/jdsample-avx2.asm | 2
-rw-r--r--  simd/x86_64/jdsample-sse2.asm | 2
-rw-r--r--  simd/x86_64/jfdctflt-sse.asm | 2
-rw-r--r--  simd/x86_64/jfdctfst-sse2.asm | 2
-rw-r--r--  simd/x86_64/jfdctint-avx2.asm | 2
-rw-r--r--  simd/x86_64/jfdctint-sse2.asm | 2
-rw-r--r--  simd/x86_64/jidctflt-sse2.asm | 6
-rw-r--r--  simd/x86_64/jidctfst-sse2.asm | 6
-rw-r--r--  simd/x86_64/jidctint-avx2.asm | 6
-rw-r--r--  simd/x86_64/jidctint-sse2.asm | 6
-rw-r--r--  simd/x86_64/jidctred-sse2.asm | 10
-rw-r--r--  simd/x86_64/jquantf-sse2.asm | 2
-rw-r--r--  simd/x86_64/jquanti-avx2.asm | 2
-rw-r--r--  simd/x86_64/jquanti-sse2.asm | 2
-rw-r--r--  simd/x86_64/jsimd.c | 12
-rw-r--r--  simd/x86_64/jsimdcpu.asm | 2
156 files changed, 7010 insertions, 25751 deletions
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 3472c0d..5c8009a 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -38,6 +38,14 @@ elseif(CPU_TYPE STREQUAL "i386")
endif()
endif()
+if(NOT REQUIRE_SIMD)
+ include(CheckLanguage)
+ check_language(ASM_NASM)
+ if(NOT CMAKE_ASM_NASM_COMPILER)
+ simd_fail("SIMD extensions disabled: could not find NASM compiler")
+ return()
+ endif()
+endif()
enable_language(ASM_NASM)
message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
diff --git a/simd/arm/arm/jccolext-neon.c b/simd/arm/arm/jccolext-neon.c
new file mode 100644
index 0000000..4f22e1f
--- /dev/null
+++ b/simd/arm/arm/jccolext-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jccolext-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+/*
+ * RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
+ JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data. */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb and Cr output data. */
+ JSAMPROW outptr0, outptr1, outptr2;
+
+ /* Setup conversion constants. */
+#if defined(__clang__)
+ const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+ const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+ const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+ const uint16x4x2_t consts = { consts1, consts2 };
+#endif
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 8) {
+
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 8) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ if (cols_remaining < 8) {
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+ y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+ uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+ y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_low = scaled_128_5;
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+ cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+ cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+ uint32x4_t cb_high = scaled_128_5;
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+ cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+ cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_low = scaled_128_5;
+ cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+ cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+ uint32x4_t cr_high = scaled_128_5;
+ cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+ cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+ vrshrn_n_u32(y_high, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+ vshrn_n_u32(cb_high, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+ vshrn_n_u32(cr_high, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+ /* Increment pointers. */
+ inptr += (8 * RGB_PIXELSIZE);
+ outptr0 += 8;
+ outptr1 += 8;
+ outptr2 += 8;
+ }
+ }
+}
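
The NEON kernel above is easier to follow against a scalar model of the same fixed-point scheme. The sketch below is illustrative only (not part of the patch; the helper name rgb_ycc_pixel is made up): each coefficient is round(c * 2^16), Y is descaled with a rounding right shift, and Cb/Cr fold the rounding term into scaled_128_5 = (128 << 16) + 32767 so a plain shift suffices.

    /* Scalar sketch of the fixed-point RGB -> YCbCr conversion that
     * jsimd_rgb_ycc_convert_neon() vectorizes.  Illustrative only. */
    #include <stdint.h>

    static void rgb_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
                              uint8_t *y, uint8_t *cb, uint8_t *cr)
    {
      /* round(coefficient * 2^16), matching the comment block above */
      const uint32_t F_0_299 = 19595, F_0_587 = 38470, F_0_114 = 7471;
      const uint32_t F_0_168 = 11059, F_0_331 = 21709, F_0_500 = 32768;
      const uint32_t F_0_418 = 27439, F_0_081 = 5329;
      /* 128 (the chroma offset) scaled by 2^16, plus 32767 so that the
       * plain right shifts below round Cb and Cr correctly */
      const uint32_t scaled_128_5 = (128 << 16) + 32767;

      uint32_t y32  = F_0_299 * r + F_0_587 * g + F_0_114 * b;
      uint32_t cb32 = scaled_128_5 - F_0_168 * r - F_0_331 * g + F_0_500 * b;
      uint32_t cr32 = scaled_128_5 + F_0_500 * r - F_0_418 * g - F_0_081 * b;

      *y  = (uint8_t)((y32 + 32768) >> 16);   /* rounding right shift */
      *cb = (uint8_t)(cb32 >> 16);
      *cr = (uint8_t)(cr32 >> 16);
    }

With r = g = b, the Cb and Cr coefficient sums cancel to zero and both channels come out as exactly 128, which is a quick sanity check on the constants.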
diff --git a/simd/arm/jsimd.c b/simd/arm/arm/jsimd.c
index 45f9b04..c0d5d90 100644
--- a/simd/arm/jsimd.c
+++ b/simd/arm/arm/jsimd.c
@@ -17,12 +17,12 @@
*/
#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
#include <stdio.h>
#include <string.h>
@@ -164,6 +164,19 @@ jsimd_can_rgb_ycc(void)
GLOBAL(int)
jsimd_can_rgb_gray(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -246,6 +259,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
@@ -298,12 +342,38 @@ jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
GLOBAL(int)
jsimd_can_h2v2_downsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_downsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -311,23 +381,51 @@ GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
}
GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
}
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -335,17 +433,32 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -366,10 +479,30 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
}
GLOBAL(void)
@@ -381,15 +514,46 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr);
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -397,12 +561,74 @@ GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(int)
@@ -448,6 +674,17 @@ jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
GLOBAL(int)
jsimd_can_fdct_islow(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -477,6 +714,7 @@ jsimd_can_fdct_float(void)
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
+ jsimd_fdct_islow_neon(data);
}
GLOBAL(void)
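
All of these wrappers share one shape: the jsimd_can_*() probe re-checks the layout assumptions (BITS_IN_JSAMPLE, DCTSIZE, type sizes) plus the runtime JSIMD_NEON bit, and the paired jsimd_*() wrapper forwards to the NEON kernel. A minimal caller-side sketch (illustrative; the real selection happens inside libjpeg-turbo's module-setup code):

    /* Illustrative use of a can/do pair from this file. */
    if (jsimd_can_fdct_islow())
      jsimd_fdct_islow(data);   /* dispatches to jsimd_fdct_islow_neon() */
    else
      jpeg_fdct_islow(data);    /* portable C implementation */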
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
new file mode 100644
index 0000000..2c45324
--- /dev/null
+++ b/simd/arm/arm/jsimd_neon.S
@@ -0,0 +1,499 @@
+/*
+ * ARMv7 NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+#define RESPECT_STRICT_ALIGNMENT 1
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET*)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
+
+.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+ sub \PUT_BITS, \PUT_BITS, #0x8
+ lsr \TMP, \PUT_BUFFER, \PUT_BITS
+ uxtb \TMP, \TMP
+ strb \TMP, [\BUFFER, #1]!
+ cmp \TMP, #0xff
+ /*it eq*/
+ strbeq \ZERO, [\BUFFER, #1]!
+.endm
+
+.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+ /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+ add \PUT_BITS, \SIZE
+ /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+ orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+.endm
+
+.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+ cmp \PUT_BITS, #0x10
+ blt 15f
+ eor \ZERO, \ZERO, \ZERO
+ emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+ emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+15:
+.endm
+
+.balign 16
+jsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01
+ .byte 0x02
+ .byte 0x04
+ .byte 0x08
+ .byte 0x10
+ .byte 0x20
+ .byte 0x40
+ .byte 0x80
+
+asm_function jsimd_huff_encode_one_block_neon
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ add r7, sp, #0x1c
+ sub r4, sp, #0x40
+ bfc r4, #0, #5
+ mov sp, r4 /* align sp on 32 bytes */
+ vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+ vst1.64 {d12, d13, d14, d15}, [r4, :128]
+ sub sp, #0x140 /* reserve 320 bytes */
+ str r0, [sp, #0x18] /* working state -> sp + 0x18 */
+ add r4, sp, #0x20 /* r4 = t1 */
+ ldr lr, [r7, #0x8] /* lr = dctbl */
+ sub r10, r1, #0x1 /* r10=buffer-- */
+ ldrsh r1, [r2]
+ mov r9, #0x10
+ mov r8, #0x1
+ adr r5, jsimd_huff_encode_one_block_neon_consts
+ /* prepare data */
+ vld1.8 {d26}, [r5, :64]
+ veor q8, q8, q8
+ veor q9, q9, q9
+ vdup.16 q14, r9
+ vdup.16 q15, r8
+ veor q10, q10, q10
+ veor q11, q11, q11
+ sub r1, r1, r3
+ add r9, r2, #0x22
+ add r8, r2, #0x18
+ add r3, r2, #0x36
+ vmov.16 d0[0], r1
+ vld1.16 {d2[0]}, [r9, :16]
+ vld1.16 {d4[0]}, [r8, :16]
+ vld1.16 {d6[0]}, [r3, :16]
+ add r1, r2, #0x2
+ add r9, r2, #0x30
+ add r8, r2, #0x26
+ add r3, r2, #0x28
+ vld1.16 {d0[1]}, [r1, :16]
+ vld1.16 {d2[1]}, [r9, :16]
+ vld1.16 {d4[1]}, [r8, :16]
+ vld1.16 {d6[1]}, [r3, :16]
+ add r1, r2, #0x10
+ add r9, r2, #0x40
+ add r8, r2, #0x34
+ add r3, r2, #0x1a
+ vld1.16 {d0[2]}, [r1, :16]
+ vld1.16 {d2[2]}, [r9, :16]
+ vld1.16 {d4[2]}, [r8, :16]
+ vld1.16 {d6[2]}, [r3, :16]
+ add r1, r2, #0x20
+ add r9, r2, #0x32
+ add r8, r2, #0x42
+ add r3, r2, #0xc
+ vld1.16 {d0[3]}, [r1, :16]
+ vld1.16 {d2[3]}, [r9, :16]
+ vld1.16 {d4[3]}, [r8, :16]
+ vld1.16 {d6[3]}, [r3, :16]
+ add r1, r2, #0x12
+ add r9, r2, #0x24
+ add r8, r2, #0x50
+ add r3, r2, #0xe
+ vld1.16 {d1[0]}, [r1, :16]
+ vld1.16 {d3[0]}, [r9, :16]
+ vld1.16 {d5[0]}, [r8, :16]
+ vld1.16 {d7[0]}, [r3, :16]
+ add r1, r2, #0x4
+ add r9, r2, #0x16
+ add r8, r2, #0x60
+ add r3, r2, #0x1c
+ vld1.16 {d1[1]}, [r1, :16]
+ vld1.16 {d3[1]}, [r9, :16]
+ vld1.16 {d5[1]}, [r8, :16]
+ vld1.16 {d7[1]}, [r3, :16]
+ add r1, r2, #0x6
+ add r9, r2, #0x8
+ add r8, r2, #0x52
+ add r3, r2, #0x2a
+ vld1.16 {d1[2]}, [r1, :16]
+ vld1.16 {d3[2]}, [r9, :16]
+ vld1.16 {d5[2]}, [r8, :16]
+ vld1.16 {d7[2]}, [r3, :16]
+ add r1, r2, #0x14
+ add r9, r2, #0xa
+ add r8, r2, #0x44
+ add r3, r2, #0x38
+ vld1.16 {d1[3]}, [r1, :16]
+ vld1.16 {d3[3]}, [r9, :16]
+ vld1.16 {d5[3]}, [r8, :16]
+ vld1.16 {d7[3]}, [r3, :16]
+ vcgt.s16 q8, q8, q0
+ vcgt.s16 q9, q9, q1
+ vcgt.s16 q10, q10, q2
+ vcgt.s16 q11, q11, q3
+ vabs.s16 q0, q0
+ vabs.s16 q1, q1
+ vabs.s16 q2, q2
+ vabs.s16 q3, q3
+ veor q8, q8, q0
+ veor q9, q9, q1
+ veor q10, q10, q2
+ veor q11, q11, q3
+ add r9, r4, #0x20
+ add r8, r4, #0x80
+ add r3, r4, #0xa0
+ vclz.i16 q0, q0
+ vclz.i16 q1, q1
+ vclz.i16 q2, q2
+ vclz.i16 q3, q3
+ vsub.i16 q0, q14, q0
+ vsub.i16 q1, q14, q1
+ vsub.i16 q2, q14, q2
+ vsub.i16 q3, q14, q3
+ vst1.16 {d0, d1, d2, d3}, [r4, :256]
+ vst1.16 {d4, d5, d6, d7}, [r9, :256]
+ vshl.s16 q0, q15, q0
+ vshl.s16 q1, q15, q1
+ vshl.s16 q2, q15, q2
+ vshl.s16 q3, q15, q3
+ vsub.i16 q0, q0, q15
+ vsub.i16 q1, q1, q15
+ vsub.i16 q2, q2, q15
+ vsub.i16 q3, q3, q15
+ vand q8, q8, q0
+ vand q9, q9, q1
+ vand q10, q10, q2
+ vand q11, q11, q3
+ vst1.16 {d16, d17, d18, d19}, [r8, :256]
+ vst1.16 {d20, d21, d22, d23}, [r3, :256]
+ add r1, r2, #0x46
+ add r9, r2, #0x3a
+ add r8, r2, #0x74
+ add r3, r2, #0x6a
+ vld1.16 {d8[0]}, [r1, :16]
+ vld1.16 {d10[0]}, [r9, :16]
+ vld1.16 {d12[0]}, [r8, :16]
+ vld1.16 {d14[0]}, [r3, :16]
+ veor q8, q8, q8
+ veor q9, q9, q9
+ veor q10, q10, q10
+ veor q11, q11, q11
+ add r1, r2, #0x54
+ add r9, r2, #0x2c
+ add r8, r2, #0x76
+ add r3, r2, #0x78
+ vld1.16 {d8[1]}, [r1, :16]
+ vld1.16 {d10[1]}, [r9, :16]
+ vld1.16 {d12[1]}, [r8, :16]
+ vld1.16 {d14[1]}, [r3, :16]
+ add r1, r2, #0x62
+ add r9, r2, #0x1e
+ add r8, r2, #0x68
+ add r3, r2, #0x7a
+ vld1.16 {d8[2]}, [r1, :16]
+ vld1.16 {d10[2]}, [r9, :16]
+ vld1.16 {d12[2]}, [r8, :16]
+ vld1.16 {d14[2]}, [r3, :16]
+ add r1, r2, #0x70
+ add r9, r2, #0x2e
+ add r8, r2, #0x5a
+ add r3, r2, #0x6c
+ vld1.16 {d8[3]}, [r1, :16]
+ vld1.16 {d10[3]}, [r9, :16]
+ vld1.16 {d12[3]}, [r8, :16]
+ vld1.16 {d14[3]}, [r3, :16]
+ add r1, r2, #0x72
+ add r9, r2, #0x3c
+ add r8, r2, #0x4c
+ add r3, r2, #0x5e
+ vld1.16 {d9[0]}, [r1, :16]
+ vld1.16 {d11[0]}, [r9, :16]
+ vld1.16 {d13[0]}, [r8, :16]
+ vld1.16 {d15[0]}, [r3, :16]
+ add r1, r2, #0x64
+ add r9, r2, #0x4a
+ add r8, r2, #0x3e
+ add r3, r2, #0x6e
+ vld1.16 {d9[1]}, [r1, :16]
+ vld1.16 {d11[1]}, [r9, :16]
+ vld1.16 {d13[1]}, [r8, :16]
+ vld1.16 {d15[1]}, [r3, :16]
+ add r1, r2, #0x56
+ add r9, r2, #0x58
+ add r8, r2, #0x4e
+ add r3, r2, #0x7c
+ vld1.16 {d9[2]}, [r1, :16]
+ vld1.16 {d11[2]}, [r9, :16]
+ vld1.16 {d13[2]}, [r8, :16]
+ vld1.16 {d15[2]}, [r3, :16]
+ add r1, r2, #0x48
+ add r9, r2, #0x66
+ add r8, r2, #0x5c
+ add r3, r2, #0x7e
+ vld1.16 {d9[3]}, [r1, :16]
+ vld1.16 {d11[3]}, [r9, :16]
+ vld1.16 {d13[3]}, [r8, :16]
+ vld1.16 {d15[3]}, [r3, :16]
+ vcgt.s16 q8, q8, q4
+ vcgt.s16 q9, q9, q5
+ vcgt.s16 q10, q10, q6
+ vcgt.s16 q11, q11, q7
+ vabs.s16 q4, q4
+ vabs.s16 q5, q5
+ vabs.s16 q6, q6
+ vabs.s16 q7, q7
+ veor q8, q8, q4
+ veor q9, q9, q5
+ veor q10, q10, q6
+ veor q11, q11, q7
+ add r1, r4, #0x40
+ add r9, r4, #0x60
+ add r8, r4, #0xc0
+ add r3, r4, #0xe0
+ vclz.i16 q4, q4
+ vclz.i16 q5, q5
+ vclz.i16 q6, q6
+ vclz.i16 q7, q7
+ vsub.i16 q4, q14, q4
+ vsub.i16 q5, q14, q5
+ vsub.i16 q6, q14, q6
+ vsub.i16 q7, q14, q7
+ vst1.16 {d8, d9, d10, d11}, [r1, :256]
+ vst1.16 {d12, d13, d14, d15}, [r9, :256]
+ vshl.s16 q4, q15, q4
+ vshl.s16 q5, q15, q5
+ vshl.s16 q6, q15, q6
+ vshl.s16 q7, q15, q7
+ vsub.i16 q4, q4, q15
+ vsub.i16 q5, q5, q15
+ vsub.i16 q6, q6, q15
+ vsub.i16 q7, q7, q15
+ vand q8, q8, q4
+ vand q9, q9, q5
+ vand q10, q10, q6
+ vand q11, q11, q7
+ vst1.16 {d16, d17, d18, d19}, [r8, :256]
+ vst1.16 {d20, d21, d22, d23}, [r3, :256]
+ ldr r12, [r7, #0xc] /* r12 = actbl */
+ add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
+ mov r9, r12 /* r9 = actbl */
+ add r6, r4, #0x80 /* r6 = t2 */
+ ldr r11, [r0, #0x8] /* r11 = put_buffer */
+ ldr r4, [r0, #0xc] /* r4 = put_bits */
+ ldrh r2, [r6, #-128] /* r2 = nbits */
+ ldrh r3, [r6] /* r3 = temp2 & ((((JLONG)1) << nbits) - 1) */
+ ldr r0, [lr, r2, lsl #2]
+ ldrb r5, [r1, r2]
+ put_bits r11, r4, r0, r5
+ checkbuf15 r10, r11, r4, r5, r0
+ put_bits r11, r4, r3, r2
+ checkbuf15 r10, r11, r4, r5, r0
+ mov lr, r6 /* lr = t2 */
+ add r5, r9, #0x400 /* r5 = actbl->ehufsi */
+ ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
+ veor q8, q8, q8
+ vceq.i16 q0, q0, q8
+ vceq.i16 q1, q1, q8
+ vceq.i16 q2, q2, q8
+ vceq.i16 q3, q3, q8
+ vceq.i16 q4, q4, q8
+ vceq.i16 q5, q5, q8
+ vceq.i16 q6, q6, q8
+ vceq.i16 q7, q7, q8
+ vmovn.i16 d0, q0
+ vmovn.i16 d2, q1
+ vmovn.i16 d4, q2
+ vmovn.i16 d6, q3
+ vmovn.i16 d8, q4
+ vmovn.i16 d10, q5
+ vmovn.i16 d12, q6
+ vmovn.i16 d14, q7
+ vand d0, d0, d26
+ vand d2, d2, d26
+ vand d4, d4, d26
+ vand d6, d6, d26
+ vand d8, d8, d26
+ vand d10, d10, d26
+ vand d12, d12, d26
+ vand d14, d14, d26
+ vpadd.i8 d0, d0, d2
+ vpadd.i8 d4, d4, d6
+ vpadd.i8 d8, d8, d10
+ vpadd.i8 d12, d12, d14
+ vpadd.i8 d0, d0, d4
+ vpadd.i8 d8, d8, d12
+ vpadd.i8 d0, d0, d8
+ vmov.32 r1, d0[1]
+ vmov.32 r8, d0[0]
+ mvn r1, r1
+ mvn r8, r8
+ lsrs r1, r1, #0x1
+ rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
+ rbit r1, r1 /* r1 = index1 */
+ rbit r8, r8 /* r8 = index0 */
+ ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
+ str r1, [sp, #0x14] /* index1 -> sp + 0x14 */
+ cmp r8, #0x0
+ beq 6f
+1:
+ clz r2, r8
+ add lr, lr, r2, lsl #1
+ lsl r8, r8, r2
+ ldrh r1, [lr, #-126]
+2:
+ cmp r2, #0x10
+ blt 3f
+ sub r2, r2, #0x10
+ put_bits r11, r4, r0, r6
+ cmp r4, #0x10
+ blt 2b
+ eor r3, r3, r3
+ emit_byte r10, r11, r4, r3, r12
+ emit_byte r10, r11, r4, r3, r12
+ b 2b
+3:
+ add r2, r1, r2, lsl #4
+ ldrh r3, [lr, #2]!
+ ldr r12, [r9, r2, lsl #2]
+ ldrb r2, [r5, r2]
+ put_bits r11, r4, r12, r2
+ checkbuf15 r10, r11, r4, r2, r12
+ put_bits r11, r4, r3, r1
+ checkbuf15 r10, r11, r4, r2, r12
+ lsls r8, r8, #0x1
+ bne 1b
+6:
+ add r12, sp, #0x20 /* r12 = t1 */
+ ldr r8, [sp, #0x14] /* r8 = index1 */
+ adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
+ cmp r8, #0x0
+ beq 6f
+ clz r2, r8
+ sub r12, r12, lr
+ lsl r8, r8, r2
+ add r2, r2, r12, lsr #1
+ add lr, lr, r2, lsl #1
+ b 7f
+1:
+ clz r2, r8
+ add lr, lr, r2, lsl #1
+ lsl r8, r8, r2
+7:
+ ldrh r1, [lr, #-126]
+2:
+ cmp r2, #0x10
+ blt 3f
+ sub r2, r2, #0x10
+ put_bits r11, r4, r0, r6
+ cmp r4, #0x10
+ blt 2b
+ eor r3, r3, r3
+ emit_byte r10, r11, r4, r3, r12
+ emit_byte r10, r11, r4, r3, r12
+ b 2b
+3:
+ add r2, r1, r2, lsl #4
+ ldrh r3, [lr, #2]!
+ ldr r12, [r9, r2, lsl #2]
+ ldrb r2, [r5, r2]
+ put_bits r11, r4, r12, r2
+ checkbuf15 r10, r11, r4, r2, r12
+ put_bits r11, r4, r3, r1
+ checkbuf15 r10, r11, r4, r2, r12
+ lsls r8, r8, #0x1
+ bne 1b
+6:
+ add r0, sp, #0x20
+ add r0, #0xfe
+ cmp lr, r0
+ bhs 1f
+ ldr r1, [r9]
+ ldrb r0, [r5]
+ put_bits r11, r4, r1, r0
+ checkbuf15 r10, r11, r4, r0, r1
+1:
+ ldr r12, [sp, #0x18]
+ str r11, [r12, #0x8]
+ str r4, [r12, #0xc]
+ add r0, r10, #0x1
+ add r4, sp, #0x140
+ vld1.64 {d8, d9, d10, d11}, [r4, :128]!
+ vld1.64 {d12, d13, d14, d15}, [r4, :128]
+ sub r4, r7, #0x1c
+ mov sp, r4
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf15
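
The emit_byte/put_bits/checkbuf15 macros implement the JPEG entropy coder's bit buffer, including the 0x00 stuffing byte that must follow every emitted 0xFF. A C model of the same logic may help when reading the register-level code (illustrative sketch, not part of the patch):

    #include <stdint.h>

    /* C model of put_bits: queue 'size' bits of 'code' in the buffer. */
    static void put_bits(uint32_t *put_buffer, int *put_bits,
                         uint32_t code, int size)
    {
      *put_buffer = (*put_buffer << size) | code;
      *put_bits += size;
    }

    /* C model of emit_byte: pop the top 8 queued bits and stuff a zero
     * byte after any 0xFF, as the JPEG bitstream requires. */
    static void emit_byte(uint8_t **buffer, uint32_t put_buffer,
                          int *put_bits)
    {
      *put_bits -= 8;
      uint8_t byte = (uint8_t)(put_buffer >> *put_bits);
      *(*buffer)++ = byte;
      if (byte == 0xFF)
        *(*buffer)++ = 0x00;
    }

    /* C model of checkbuf15: flush two bytes once 16+ bits are queued. */
    static void checkbuf15(uint8_t **buffer, uint32_t put_buffer,
                           int *put_bits)
    {
      if (*put_bits >= 16) {
        emit_byte(buffer, put_buffer, put_bits);
        emit_byte(buffer, put_buffer, put_bits);
      }
    }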
diff --git a/simd/arm/arm64/jccolext-neon.c b/simd/arm/arm64/jccolext-neon.c
new file mode 100644
index 0000000..89f520a
--- /dev/null
+++ b/simd/arm/arm64/jccolext-neon.c
@@ -0,0 +1,312 @@
+/*
+ * jccolext-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+/*
+ * RGB -> YCbCr conversion is defined by the following equations:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * 0.16874695 = 11059 * 2^-16
+ * 0.33125305 = 21709 * 2^-16
+ * 0.50000000 = 32768 * 2^-16
+ * 0.41868592 = 27439 * 2^-16
+ * 0.08131409 = 5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
+ JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row,
+ int num_rows)
+{
+ /* Pointer to RGB(X/A) input data. */
+ JSAMPROW inptr;
+ /* Pointers to Y, Cb and Cr output data. */
+ JSAMPROW outptr0, outptr1, outptr2;
+
+ /* Setup conversion constants. */
+ const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+ const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
+ y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
+ y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
+ y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
+ y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
+ cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
+ cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr0 += 16;
+ outptr1 += 16;
+ outptr2 += 16;
+ }
+
+ if (cols_remaining > 8) {
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 16) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+ y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+ uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
+ y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
+ y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+ y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+ uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
+ y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
+ y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_ll = scaled_128_5;
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+ cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+ cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+ uint32x4_t cb_lh = scaled_128_5;
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
+ cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
+ cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ uint32x4_t cb_hl = scaled_128_5;
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+ cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+ cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+ uint32x4_t cb_hh = scaled_128_5;
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
+ cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
+ cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_ll = scaled_128_5;
+ cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+ cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+ uint32x4_t cr_lh = scaled_128_5;
+ cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
+ cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ uint32x4_t cr_hl = scaled_128_5;
+ cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+ cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+ uint32x4_t cr_hh = scaled_128_5;
+ cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
+ cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+ vshrn_n_u32(cb_lh, 16));
+ uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+ vshrn_n_u32(cb_hh, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+ vshrn_n_u32(cr_lh, 16));
+ uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+ vshrn_n_u32(cr_hh, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+ vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+ vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+ } else if (cols_remaining > 0) {
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 8) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+ uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+ uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+ y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+ uint32x4_t y_h = vmull_high_laneq_u16(r, consts, 0);
+ y_h = vmlal_high_laneq_u16(y_h, g, consts, 1);
+ y_h = vmlal_high_laneq_u16(y_h, b, consts, 2);
+
+ /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
+ uint32x4_t cb_l = scaled_128_5;
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+ cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+ cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+ uint32x4_t cb_h = scaled_128_5;
+ cb_h = vmlsl_high_laneq_u16(cb_h, r, consts, 3);
+ cb_h = vmlsl_high_laneq_u16(cb_h, g, consts, 4);
+ cb_h = vmlal_high_laneq_u16(cb_h, b, consts, 5);
+
+ /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
+ uint32x4_t cr_l = scaled_128_5;
+ cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+ cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+ uint32x4_t cr_h = scaled_128_5;
+ cr_h = vmlal_high_laneq_u16(cr_h, r, consts, 5);
+ cr_h = vmlsl_high_laneq_u16(cr_h, g, consts, 6);
+ cr_h = vmlsl_high_laneq_u16(cr_h, b, consts, 7);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+ vrshrn_n_u32(y_h, 16));
+ /* Descale Cb values (right shift) and narrow to 16-bit. */
+ uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+ vshrn_n_u32(cb_h, 16));
+ /* Descale Cr values (right shift) and narrow to 16-bit. */
+ uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+ vshrn_n_u32(cr_h, 16));
+ /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
+ /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1_u8(outptr0, vmovn_u16(y_u16));
+ vst1_u8(outptr1, vmovn_u16(cb_u16));
+ vst1_u8(outptr2, vmovn_u16(cr_u16));
+ }
+ }
+}
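
Note the three-tier structure above: a 16-pixel main loop, a 16-pixel path for tails of 9-15 columns, and an 8-pixel path for tails of 1-8 columns, with both tail paths staging the partial row in an aligned stack buffer so the full-width vector loads cannot overread the source row. The idiom generalizes; a hedged generic sketch (names and widths are illustrative, not from the patch):

    #include <stdint.h>
    #include <string.h>

    /* Generic sketch of the tail-staging idiom used above. */
    void process_row(const uint8_t *in, size_t pixels, size_t bpp)
    {
      size_t remaining = pixels;
      for (; remaining >= 16; remaining -= 16) {
        /* full-width vector load / compute / store on 'in' ... */
        in += 16 * bpp;
      }
      if (remaining > 0) {
        /* Stage the partial tail in a buffer big enough for one full
         * vector load so the vector path cannot overread the row. */
        _Alignas(16) uint8_t tmp[16 * 4];   /* 4 == max bytes per pixel */
        memcpy(tmp, in, remaining * bpp);
        /* vector load from 'tmp', compute, store 'remaining' outputs ... */
      }
    }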
diff --git a/simd/arm64/jsimd.c b/simd/arm/arm64/jsimd.c
index 0e6c7b9..ca29cd6 100644
--- a/simd/arm64/jsimd.c
+++ b/simd/arm/arm64/jsimd.c
@@ -16,25 +16,22 @@
*/
#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
#define JSIMD_FASTTBL 4
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
- JSIMD_FASTTBL;
+static unsigned int simd_features = JSIMD_FASTTBL;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
@@ -154,16 +151,6 @@ init_simd(void)
env = getenv("JSIMD_NOHUFFENC");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_huffman = 0;
- env = getenv("JSIMD_FASTLD3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTLD3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTLD3;
- env = getenv("JSIMD_FASTST3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_features |= JSIMD_FASTST3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
- simd_features &= ~JSIMD_FASTST3;
#endif
}
@@ -189,6 +176,19 @@ jsimd_can_rgb_ycc(void)
GLOBAL(int)
jsimd_can_rgb_gray(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -237,20 +237,14 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extrgb_ycc_convert_neon;
- else
- neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extrgb_ycc_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_ycc_convert_neon;
break;
case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extbgr_ycc_convert_neon;
- else
- neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extbgr_ycc_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
@@ -265,10 +259,7 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
neonfct = jsimd_extxrgb_ycc_convert_neon;
break;
default:
- if (simd_features & JSIMD_FASTLD3)
- neonfct = jsimd_extrgb_ycc_convert_neon;
- else
- neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+ neonfct = jsimd_extrgb_ycc_convert_neon;
break;
}
@@ -280,6 +271,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
+ void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+ switch (cinfo->in_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_extrgbx_gray_convert_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_extbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_extbgrx_gray_convert_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_extxbgr_gray_convert_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_extxrgb_gray_convert_neon;
+ break;
+ default:
+ neonfct = jsimd_extrgb_gray_convert_neon;
+ break;
+ }
+
+ neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
}
GLOBAL(void)
@@ -291,20 +313,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
- if (simd_features & JSIMD_FASTST3)
- neonfct = jsimd_ycc_extrgb_convert_neon;
- else
- neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+ neonfct = jsimd_ycc_extrgb_convert_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_ycc_extrgbx_convert_neon;
break;
case JCS_EXT_BGR:
- if (simd_features & JSIMD_FASTST3)
- neonfct = jsimd_ycc_extbgr_convert_neon;
- else
- neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+ neonfct = jsimd_ycc_extbgr_convert_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
@@ -319,11 +335,7 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
neonfct = jsimd_ycc_extxrgb_convert_neon;
break;
default:
- if (simd_features & JSIMD_FASTST3)
- neonfct = jsimd_ycc_extrgb_convert_neon;
- else
- neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
- break;
+ neonfct = jsimd_ycc_extrgb_convert_neon;
}
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
@@ -397,12 +409,34 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
GLOBAL(int)
jsimd_can_h2v2_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -410,23 +444,66 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_fancy_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_fancy_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -434,23 +511,60 @@ GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
}
GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
}
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -458,12 +572,74 @@ GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
+ void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+ switch (cinfo->out_color_space) {
+ case JCS_EXT_RGB:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGR:
+ neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+ break;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+ break;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+ break;
+ default:
+ neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+ break;
+ }
+
+ neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}
GLOBAL(int)
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
new file mode 100644
index 0000000..898cf2c
--- /dev/null
+++ b/simd/arm/arm64/jsimd_neon.S
@@ -0,0 +1,538 @@
+/*
+ * ARMv8 NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+#endif
+
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
+#else
+.section .rodata, "a", %progbits
+#endif
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
+
+.text
+
+
+#define RESPECT_STRICT_ALIGNMENT 1
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+_\fname:
+#else
+ .global \fname
+#ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+ adrp \reg, \symbol@PAGE
+ add \reg, \reg, \symbol@PAGEOFF
+#else
+ adrp \reg, \symbol
+ add \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
+
+#define CENTERJSAMPLE 128
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
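+
+/*
+ * Outline: the 64 coefficients are zig-zag reordered (with tbl/tbx lookups
+ * in the fast-table variant), converted to sign/magnitude form, and a
+ * 64-bit bitmap of non-zero coefficients is built with cmeq/addp; the
+ * bitmap is then walked with rbit/clz to emit zero runs and Huffman codes.
+ */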
+
+ BUFFER .req x1
+ PUT_BUFFER .req x6
+ PUT_BITS .req x7
+ PUT_BITSw .req w7
+
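+/* Write the next complete byte from the top of the bit buffer to BUFFER;
+   a zero byte is stuffed after any 0xFF, as the JPEG format requires. */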
+.macro emit_byte
+ sub PUT_BITS, PUT_BITS, #0x8
+ lsr x19, PUT_BUFFER, PUT_BITS
+ uxtb w19, w19
+ strb w19, [BUFFER, #1]!
+ cmp w19, #0xff
+ b.ne 14f
+ strb wzr, [BUFFER, #1]!
+14:
+.endm
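+/* Append the SIZE least-significant bits of CODE to the bit buffer. */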
+.macro put_bits CODE, SIZE
+ lsl PUT_BUFFER, PUT_BUFFER, \SIZE
+ add PUT_BITS, PUT_BITS, \SIZE
+ orr PUT_BUFFER, PUT_BUFFER, \CODE
+.endm
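+/* checkbuf31/checkbuf47: once more than 31 (resp. 47) bits are pending in
+   the bit buffer, flush four (resp. six) complete bytes to BUFFER. */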
+.macro checkbuf31
+ cmp PUT_BITS, #0x20
+ b.lt 31f
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+31:
+.endm
+.macro checkbuf47
+ cmp PUT_BITS, #0x30
+ b.lt 47f
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+ emit_byte
+47:
+.endm
+
+.macro generate_jsimd_huff_encode_one_block fast_tbl
+
+.balign 16
+.if \fast_tbl == 1
+asm_function jsimd_huff_encode_one_block_neon
+.else
+asm_function jsimd_huff_encode_one_block_neon_slowtbl
+.endif
+ sub sp, sp, 272
+ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
+ /* Save ARM registers */
+ stp x19, x20, [sp]
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
+ ldr PUT_BUFFER, [x0, #0x10]
+ ldr PUT_BITSw, [x0, #0x18]
+ ldrsh w12, [x2] /* load DC coeff in w12 */
+ /* prepare data */
+.if \fast_tbl == 1
+ ld1 {v23.16b}, [x15], #16
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ /* ZigZag 8x8 */
+ tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
+ tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
+ tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
+ tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
+ tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
+ tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
+ tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
+ tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
+ ins v0.h[0], w12
+ tbx v1.16b, {v28.16b}, v16.16b
+ tbx v2.16b, {v29.16b, v30.16b}, v17.16b
+ tbx v5.16b, {v29.16b, v30.16b}, v18.16b
+ tbx v6.16b, {v31.16b}, v19.16b
+.else
+ add x13, x2, #0x22
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
+ ld1 {v23.16b}, [x15]
+ add x14, x2, #0x18
+ add x3, x2, #0x36
+ ins v0.h[0], w12
+ add x9, x2, #0x2
+ ld1 {v1.h}[0], [x13]
+ add x15, x2, #0x30
+ ld1 {v2.h}[0], [x14]
+ add x19, x2, #0x26
+ ld1 {v3.h}[0], [x3]
+ add x20, x2, #0x28
+ ld1 {v0.h}[1], [x9]
+ add x12, x2, #0x10
+ ld1 {v1.h}[1], [x15]
+ add x13, x2, #0x40
+ ld1 {v2.h}[1], [x19]
+ add x14, x2, #0x34
+ ld1 {v3.h}[1], [x20]
+ add x3, x2, #0x1a
+ ld1 {v0.h}[2], [x12]
+ add x9, x2, #0x20
+ ld1 {v1.h}[2], [x13]
+ add x15, x2, #0x32
+ ld1 {v2.h}[2], [x14]
+ add x19, x2, #0x42
+ ld1 {v3.h}[2], [x3]
+ add x20, x2, #0xc
+ ld1 {v0.h}[3], [x9]
+ add x12, x2, #0x12
+ ld1 {v1.h}[3], [x15]
+ add x13, x2, #0x24
+ ld1 {v2.h}[3], [x19]
+ add x14, x2, #0x50
+ ld1 {v3.h}[3], [x20]
+ add x3, x2, #0xe
+ ld1 {v0.h}[4], [x12]
+ add x9, x2, #0x4
+ ld1 {v1.h}[4], [x13]
+ add x15, x2, #0x16
+ ld1 {v2.h}[4], [x14]
+ add x19, x2, #0x60
+ ld1 {v3.h}[4], [x3]
+ add x20, x2, #0x1c
+ ld1 {v0.h}[5], [x9]
+ add x12, x2, #0x6
+ ld1 {v1.h}[5], [x15]
+ add x13, x2, #0x8
+ ld1 {v2.h}[5], [x19]
+ add x14, x2, #0x52
+ ld1 {v3.h}[5], [x20]
+ add x3, x2, #0x2a
+ ld1 {v0.h}[6], [x12]
+ add x9, x2, #0x14
+ ld1 {v1.h}[6], [x13]
+ add x15, x2, #0xa
+ ld1 {v2.h}[6], [x14]
+ add x19, x2, #0x44
+ ld1 {v3.h}[6], [x3]
+ add x20, x2, #0x38
+ ld1 {v0.h}[7], [x9]
+ add x12, x2, #0x46
+ ld1 {v1.h}[7], [x15]
+ add x13, x2, #0x3a
+ ld1 {v2.h}[7], [x19]
+ add x14, x2, #0x74
+ ld1 {v3.h}[7], [x20]
+ add x3, x2, #0x6a
+ ld1 {v4.h}[0], [x12]
+ add x9, x2, #0x54
+ ld1 {v5.h}[0], [x13]
+ add x15, x2, #0x2c
+ ld1 {v6.h}[0], [x14]
+ add x19, x2, #0x76
+ ld1 {v7.h}[0], [x3]
+ add x20, x2, #0x78
+ ld1 {v4.h}[1], [x9]
+ add x12, x2, #0x62
+ ld1 {v5.h}[1], [x15]
+ add x13, x2, #0x1e
+ ld1 {v6.h}[1], [x19]
+ add x14, x2, #0x68
+ ld1 {v7.h}[1], [x20]
+ add x3, x2, #0x7a
+ ld1 {v4.h}[2], [x12]
+ add x9, x2, #0x70
+ ld1 {v5.h}[2], [x13]
+ add x15, x2, #0x2e
+ ld1 {v6.h}[2], [x14]
+ add x19, x2, #0x5a
+ ld1 {v7.h}[2], [x3]
+ add x20, x2, #0x6c
+ ld1 {v4.h}[3], [x9]
+ add x12, x2, #0x72
+ ld1 {v5.h}[3], [x15]
+ add x13, x2, #0x3c
+ ld1 {v6.h}[3], [x19]
+ add x14, x2, #0x4c
+ ld1 {v7.h}[3], [x20]
+ add x3, x2, #0x5e
+ ld1 {v4.h}[4], [x12]
+ add x9, x2, #0x64
+ ld1 {v5.h}[4], [x13]
+ add x15, x2, #0x4a
+ ld1 {v6.h}[4], [x14]
+ add x19, x2, #0x3e
+ ld1 {v7.h}[4], [x3]
+ add x20, x2, #0x6e
+ ld1 {v4.h}[5], [x9]
+ add x12, x2, #0x56
+ ld1 {v5.h}[5], [x15]
+ add x13, x2, #0x58
+ ld1 {v6.h}[5], [x19]
+ add x14, x2, #0x4e
+ ld1 {v7.h}[5], [x20]
+ add x3, x2, #0x7c
+ ld1 {v4.h}[6], [x12]
+ add x9, x2, #0x48
+ ld1 {v5.h}[6], [x13]
+ add x15, x2, #0x66
+ ld1 {v6.h}[6], [x14]
+ add x19, x2, #0x5c
+ ld1 {v7.h}[6], [x3]
+ add x20, x2, #0x7e
+ ld1 {v4.h}[7], [x9]
+ ld1 {v5.h}[7], [x15]
+ ld1 {v6.h}[7], [x19]
+ ld1 {v7.h}[7], [x20]
+.endif
+ cmlt v24.8h, v0.8h, #0
+ cmlt v25.8h, v1.8h, #0
+ cmlt v26.8h, v2.8h, #0
+ cmlt v27.8h, v3.8h, #0
+ cmlt v28.8h, v4.8h, #0
+ cmlt v29.8h, v5.8h, #0
+ cmlt v30.8h, v6.8h, #0
+ cmlt v31.8h, v7.8h, #0
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+ abs v3.8h, v3.8h
+ abs v4.8h, v4.8h
+ abs v5.8h, v5.8h
+ abs v6.8h, v6.8h
+ abs v7.8h, v7.8h
+ eor v24.16b, v24.16b, v0.16b
+ eor v25.16b, v25.16b, v1.16b
+ eor v26.16b, v26.16b, v2.16b
+ eor v27.16b, v27.16b, v3.16b
+ eor v28.16b, v28.16b, v4.16b
+ eor v29.16b, v29.16b, v5.16b
+ eor v30.16b, v30.16b, v6.16b
+ eor v31.16b, v31.16b, v7.16b
+ cmeq v16.8h, v0.8h, #0
+ cmeq v17.8h, v1.8h, #0
+ cmeq v18.8h, v2.8h, #0
+ cmeq v19.8h, v3.8h, #0
+ cmeq v20.8h, v4.8h, #0
+ cmeq v21.8h, v5.8h, #0
+ cmeq v22.8h, v6.8h, #0
+ xtn v16.8b, v16.8h
+ xtn v18.8b, v18.8h
+ xtn v20.8b, v20.8h
+ xtn v22.8b, v22.8h
+ umov w14, v0.h[0]
+ xtn2 v16.16b, v17.8h
+ umov w13, v24.h[0]
+ xtn2 v18.16b, v19.8h
+ clz w14, w14
+ xtn2 v20.16b, v21.8h
+ lsl w13, w13, w14
+ cmeq v17.8h, v7.8h, #0
+ sub w12, w14, #32
+ xtn2 v22.16b, v17.8h
+ lsr w13, w13, w14
+ and v16.16b, v16.16b, v23.16b
+ neg w12, w12
+ and v18.16b, v18.16b, v23.16b
+ add x3, x4, #0x400 /* x3 = dctbl->ehufsi */
+ and v20.16b, v20.16b, v23.16b
+ add x15, sp, #0x90 /* x15 = t2 */
+ and v22.16b, v22.16b, v23.16b
+ ldr w10, [x4, x12, lsl #2]
+ addp v16.16b, v16.16b, v18.16b
+ ldrb w11, [x3, x12]
+ addp v20.16b, v20.16b, v22.16b
+ checkbuf47
+ addp v16.16b, v16.16b, v20.16b
+ put_bits x10, x11
+ addp v16.16b, v16.16b, v18.16b
+ checkbuf47
+ umov x9, v16.D[0]
+ put_bits x13, x12
+ cnt v17.8b, v16.8b
+ mvn x9, x9
+ addv B18, v17.8b
+ add x4, x5, #0x400 /* x4 = actbl->ehufsi */
+ umov w12, v18.b[0]
+ lsr x9, x9, #0x1 /* clear AC coeff */
+ ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
+ rbit x9, x9 /* x9 = index0 */
+ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
+ cmp w12, #(64-8)
+ add x11, sp, #16
+ b.lt 4f
+ cbz x9, 6f
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+ clz x2, x9
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w20, [x15, #-126]
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14
+ b 2b
+3:
+ clz w20, w20
+ ldrh w3, [x15, #2]!
+ sub w11, w20, #32
+ lsl w3, w3, w20
+ neg w11, w11
+ lsr w3, w3, w20
+ add x2, x11, x2, lsl #4
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+ b 6f
+4:
+ movi v21.8h, #0x0010
+ clz v0.8h, v0.8h
+ clz v1.8h, v1.8h
+ clz v2.8h, v2.8h
+ clz v3.8h, v3.8h
+ clz v4.8h, v4.8h
+ clz v5.8h, v5.8h
+ clz v6.8h, v6.8h
+ clz v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ neg v0.8h, v0.8h
+ neg v1.8h, v1.8h
+ neg v2.8h, v2.8h
+ neg v3.8h, v3.8h
+ neg v4.8h, v4.8h
+ neg v5.8h, v5.8h
+ neg v6.8h, v6.8h
+ neg v7.8h, v7.8h
+ ushl v24.8h, v24.8h, v0.8h
+ ushl v25.8h, v25.8h, v1.8h
+ ushl v26.8h, v26.8h, v2.8h
+ ushl v27.8h, v27.8h, v3.8h
+ ushl v28.8h, v28.8h, v4.8h
+ ushl v29.8h, v29.8h, v5.8h
+ ushl v30.8h, v30.8h, v6.8h
+ ushl v31.8h, v31.8h, v7.8h
+ add v0.8h, v21.8h, v0.8h
+ add v1.8h, v21.8h, v1.8h
+ add v2.8h, v21.8h, v2.8h
+ add v3.8h, v21.8h, v3.8h
+ add v4.8h, v21.8h, v4.8h
+ add v5.8h, v21.8h, v5.8h
+ add v6.8h, v21.8h, v6.8h
+ add v7.8h, v21.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
+1:
+ clz x2, x9
+ add x15, x15, x2, lsl #1
+ lsl x9, x9, x2
+ ldrh w11, [x15, #-126]
+2:
+ cmp x2, #0x10
+ b.lt 3f
+ sub x2, x2, #0x10
+ checkbuf47
+ put_bits x13, x14
+ b 2b
+3:
+ ldrh w3, [x15, #2]!
+ add x2, x11, x2, lsl #4
+ lsl x9, x9, #0x1
+ ldr w12, [x5, x2, lsl #2]
+ ldrb w10, [x4, x2]
+ checkbuf31
+ put_bits x12, x10
+ put_bits x3, x11
+ cbnz x9, 1b
+6:
+ add x13, sp, #0x10e
+ cmp x15, x13
+ b.hs 1f
+ ldr w12, [x5]
+ ldrb w14, [x4]
+ checkbuf47
+ put_bits x12, x14
+1:
+ str PUT_BUFFER, [x0, #0x10]
+ str PUT_BITSw, [x0, #0x18]
+ ldp x19, x20, [sp], 16
+ add x0, BUFFER, #0x1
+ add sp, sp, 256
+ br x30
+
+.endm
+
+generate_jsimd_huff_encode_one_block 1
+generate_jsimd_huff_encode_one_block 0
+
+ .unreq BUFFER
+ .unreq PUT_BUFFER
+ .unreq PUT_BITS
+ .unreq PUT_BITSw
+
+.purgem emit_byte
+.purgem put_bits
+.purgem checkbuf31
+.purgem checkbuf47
diff --git a/simd/arm/common/jccolor-neon.c b/simd/arm/common/jccolor-neon.c
new file mode 100644
index 0000000..f87c8d9
--- /dev/null
+++ b/simd/arm/common/jccolor-neon.c
@@ -0,0 +1,158 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/* RGB -> YCbCr conversion constants. */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+#define F_0_168 11059
+#define F_0_331 21709
+#define F_0_500 32768
+#define F_0_418 27439
+#define F_0_081 5329
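+
+/* Each F_* value above is the corresponding RGB -> YCbCr coefficient scaled
+ * by 2^16 and rounded, e.g. 0.29899597 = 19595 * 2^-16.
+ */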
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+ F_0_298, F_0_587,
+ F_0_113, F_0_168,
+ F_0_331, F_0_500,
+ F_0_418, F_0_081
+ };
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "../arm64/jccolext-neon.c"
+#else
+#include "../arm/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/simd/arm/common/jcgray-neon.c b/simd/arm/common/jcgray-neon.c
new file mode 100644
index 0000000..39d903f
--- /dev/null
+++ b/simd/arm/common/jcgray-neon.c
@@ -0,0 +1,118 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/* RGB -> Grayscale conversion constants. */
+
+#define F_0_298 19595
+#define F_0_587 38470
+#define F_0_113 7471
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/simd/arm/common/jcgryext-neon.c b/simd/arm/common/jcgryext-neon.c
new file mode 100644
index 0000000..69ea67f
--- /dev/null
+++ b/simd/arm/common/jcgryext-neon.c
@@ -0,0 +1,107 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+/*
+ * RGB -> Grayscale conversion is defined by the following equation:
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ * 0.29899597 = 19595 * 2^-16
+ * 0.58700561 = 38470 * 2^-16
+ * 0.11399841 = 7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * Rounding is used when descaling to ensure correct results.
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
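+
+/* For example, for a grey pixel with R = G = B = 100:
+ * Y = (19595 * 100 + 38470 * 100 + 7471 * 100 + 32768) >> 16 = 100,
+ * since the three constants sum to exactly 2^16 and the rounding right
+ * shift restores the unscaled value.
+ */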
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width,
+ JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf,
+ JDIMENSION output_row,
+ int num_rows)
+{
+ JSAMPROW inptr;
+ JSAMPROW outptr;
+
+ while (--num_rows >= 0) {
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ int cols_remaining = image_width;
+ for (; cols_remaining > 0; cols_remaining -= 16) {
+
+ /* To prevent buffer overread by the vector load instructions, the */
+ /* last (image_width % 16) columns of data are first memcopied to a */
+ /* temporary buffer large enough to accommodate the vector load. */
+ if (cols_remaining < 16) {
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+ memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+ inptr = tmp_buf;
+ }
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+ uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+ uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+ uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+ uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+ uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+ /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+ uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+ uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+ uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+ uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+ y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+ y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+ y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+ y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+ /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+ uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+ vrshrn_n_u32(y_lh, 16));
+ uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+ vrshrn_n_u32(y_hh, 16));
+
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is */
+ /* permitted up to the next multiple of ALIGN_SIZE bytes. */
+ vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+ /* Increment pointers. */
+ inptr += (16 * RGB_PIXELSIZE);
+ outptr += 16;
+ }
+ }
+}
diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/common/jcsample-neon.c
new file mode 100644
index 0000000..a5ddf16
--- /dev/null
+++ b/simd/arm/common/jcsample-neon.c
@@ -0,0 +1,191 @@
+/*
+ * jcsample-neon.c - downsampling (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+
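+/* Table of vtbl/vqtbl indices: in row "Pad N", the last N lanes replicate
+ * the final valid pixel, so a partial trailing vector can be padded out to
+ * a full 16 bytes before the pairwise averaging below.
+ */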
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/*
+ * Downsample pixel values of a single chroma component, i.e. Cb or Cr.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
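+
+/* Each output sample is (in[2i] + in[2i+1] + bias) >> 1, where the bias
+ * alternates 0, 1, 0, 1, ... across output columns, replicating the
+ * rounding behaviour of the unaccelerated h2v1_downsample() in jcsample.c.
+ */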
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
+{
+ JSAMPROW inptr, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask = vld1q_u8(
+ &jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern alternating every pixel. */
+ const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
+
+ for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (unsigned i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ /* Store samples to memory. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+ uint8x8x2_t table = { vget_low_u8(pixels), vget_high_u8(pixels) };
+ pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+ vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+ /* Divide total by 2, narrow to 8-bit and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
+
+
+/*
+ * Downsample pixel values of a single chroma component, i.e. Cb or Cr.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
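+
+/* Each output sample is (r0[2i] + r0[2i+1] + r1[2i] + r1[2i+1] + bias) >> 2,
+ * where the bias alternates 1, 2, 1, 2, ... across output columns,
+ * replicating the rounding behaviour of the unaccelerated
+ * h2v2_downsample() in jcsample.c.
+ */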
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
+{
+ JSAMPROW inptr0, inptr1, outptr;
+ /* Load expansion mask to pad remaining elements of last DCT block. */
+ const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+ const uint8x16_t expand_mask = vld1q_u8(
+ &jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern alternating every pixel. */
+ const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+
+ for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr0 = input_data[outrow];
+ inptr1 = input_data[outrow + 1];
+
+ /* Downsample all but the last DCT block of pixels. */
+ for (unsigned i = 0; i < width_in_blocks - 1; i++) {
+ uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+ /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4 and narrow to 8-bit. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ /* Store samples to memory and increment pointers. */
+ vst1_u8(outptr + i * DCTSIZE, samples_u8);
+ }
+
+ /* Load pixels in last DCT block into a table. */
+ uint8x16_t pixels_r0 = vld1q_u8(
+ inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 = vld1q_u8(
+ inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__)
+ /* Pad the empty elements with the value of the last pixel. */
+ pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+ pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+ uint8x8x2_t table_r0 = { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) };
+ uint8x8x2_t table_r1 = { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) };
+ pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+ pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+ vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+ /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+ /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+ /* Divide total by 4, narrow to 8-bit and store. */
+ uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+ vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+ }
+}
diff --git a/simd/arm/common/jdcolext-neon.c b/simd/arm/common/jdcolext-neon.c
new file mode 100644
index 0000000..b201792
--- /dev/null
+++ b/simd/arm/common/jdcolext-neon.c
@@ -0,0 +1,330 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+/*
+ * YCbCr -> RGB conversion is defined by the following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * Rounding is used when descaling to ensure correct results.
+ */
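+
+/* For example, for Cr = 200:
+ * R - Y = ((200 - 128) * 22971 + 2^13) >> 14 = 101 = round(1.40200 * 72),
+ * so the fixed-point path matches the rounded floating-point result.
+ */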
+
+/*
+ * Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width since the
+ * TurboJPEG API permits it to be allocated with or without padding up to the
+ * next multiple of ALIGN_SIZE bytes.
+ */
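+
+/* The tail (cols_remaining < 8) paths below therefore store exactly
+ * cols_remaining pixels using per-lane stores rather than full vector
+ * stores.
+ */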
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION input_row,
+ JSAMPARRAY output_buf,
+ int num_rows)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb and Cr data. */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ while (--num_rows >= 0) {
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ uint8x16_t y = vld1q_u8(inptr0);
+ uint8x16_t cb = vld1q_u8(inptr1);
+ uint8x16_t cr = vld1q_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128_l = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cr)));
+ int16x8_t cr_128_h = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cr)));
+ int16x8_t cb_128_l = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cb)));
+ int16x8_t cb_128_h = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cb)));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_ll = vmull_n_s16(vget_low_s16(cb_128_l), -F_0_344);
+ int32x4_t g_sub_y_lh = vmull_n_s16(vget_high_s16(cb_128_l), -F_0_344);
+ int32x4_t g_sub_y_hl = vmull_n_s16(vget_low_s16(cb_128_h), -F_0_344);
+ int32x4_t g_sub_y_hh = vmull_n_s16(vget_high_s16(cb_128_h), -F_0_344);
+ g_sub_y_ll = vmlsl_n_s16(g_sub_y_ll, vget_low_s16(cr_128_l), F_0_714);
+ g_sub_y_lh = vmlsl_n_s16(g_sub_y_lh, vget_high_s16(cr_128_l), F_0_714);
+ g_sub_y_hl = vmlsl_n_s16(g_sub_y_hl, vget_low_s16(cr_128_h), F_0_714);
+ g_sub_y_hh = vmlsl_n_s16(g_sub_y_hh, vget_high_s16(cr_128_h), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+ vrshrn_n_s32(g_sub_y_lh, 15));
+ int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+ vrshrn_n_s32(g_sub_y_hh, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_l, 1), F_1_402);
+ int16x8_t r_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_h, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_l, 1), F_1_772);
+ int16x8_t b_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_h, 1), F_1_772);
+ /* Add Y. */
+ int16x8_t r_l = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), vget_low_u8(y)));
+ int16x8_t r_h = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), vget_high_u8(y)));
+ int16x8_t b_l = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), vget_low_u8(y)));
+ int16x8_t b_h = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), vget_high_u8(y)));
+ int16x8_t g_l = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), vget_low_u8(y)));
+ int16x8_t g_h = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+ uint8x16x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x16x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+ rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+ rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#else /* RGB565 */
+ /* Pack R, G and B values in ratio 5:6:5. */
+ uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+ rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+ uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+ rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565_l);
+ vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif /* RGB565 */
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 16;
+ inptr2 += 16;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining >= 8) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ /* Add Y. */
+ int16x8_t r = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ vst3_u8(outptr, rgb);
+#else /* RGB565 */
+ /* Pack R, G and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB pixel data to memory. */
+ vst1q_u16((uint16_t *)outptr, rgb565);
+#endif /* RGB565 */
+
+ /* Increment pointers. */
+ inptr0 += 8;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 8);
+ cols_remaining -= 8;
+ }
+
+ /* Handle the tail elements. */
+ if (cols_remaining > 0) {
+ uint8x8_t y = vld1_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ /* Add Y. */
+ int16x8_t r = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+ uint8x8x4_t rgba;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgba.val[RGB_RED] = vqmovun_s16(r);
+ rgba.val[RGB_GREEN] = vqmovun_s16(g);
+ rgba.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 7 :
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+ case 6 :
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+ case 5 :
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+ case 4 :
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+ case 3 :
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+ case 2 :
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+ case 1 :
+ vst4_lane_u8(outptr, rgba, 0);
+ default:
+ break;
+ }
+#elif RGB_PIXELSIZE == 3
+ uint8x8x3_t rgb;
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ rgb.val[RGB_RED] = vqmovun_s16(r);
+ rgb.val[RGB_GREEN] = vqmovun_s16(g);
+ rgb.val[RGB_BLUE] = vqmovun_s16(b);
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 7 :
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+ case 6 :
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+ case 5 :
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+ case 4 :
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+ case 3 :
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+ case 2 :
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+ case 1 :
+ vst3_lane_u8(outptr, rgb, 0);
+ default:
+ break;
+ }
+#else /* RGB565 */
+ /* Pack R, G and B values in ratio 5:6:5. */
+ uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+ rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+ /* Store RGB565 pixel data to memory. */
+ switch (cols_remaining) {
+ case 7 :
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ case 6 :
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ case 5 :
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ case 4 :
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ case 3 :
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ case 2 :
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ case 1 :
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+ default:
+ break;
+ }
+#endif /* RGB565 */
+ }
+ }
+}
diff --git a/simd/arm/common/jdcolor-neon.c b/simd/arm/common/jdcolor-neon.c
new file mode 100644
index 0000000..52dab1e
--- /dev/null
+++ b/simd/arm/common/jdcolor-neon.c
@@ -0,0 +1,134 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/* YCbCr -> RGB conversion constants. */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion. */
+
+#define RGB_PIXELSIZE 2
+#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/simd/arm/common/jdmerge-neon.c b/simd/arm/common/jdmerge-neon.c
new file mode 100644
index 0000000..71798c7
--- /dev/null
+++ b/simd/arm/common/jdmerge-neon.c
@@ -0,0 +1,138 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/* YCbCr -> RGB conversion constants. */
+
+#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+
+/* Include inline routines for colorspace extensions */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_ALPHA 3
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_ALPHA 0
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
diff --git a/simd/arm/common/jdmrgext-neon.c b/simd/arm/common/jdmrgext-neon.c
new file mode 100644
index 0000000..8533d71
--- /dev/null
+++ b/simd/arm/common/jdmrgext-neon.c
@@ -0,0 +1,607 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+/*
+ * These routines perform simple chroma upsampling - h2v1 or h2v2 - followed by
+ * YCbCr -> RGB color conversion all in the same function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ * R = Y + 1.40200 * (Cr - 128)
+ * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ * B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.3441467 = 11277 * 2^-15
+ * 0.7141418 = 23401 * 2^-15
+ * 1.4020386 = 22971 * 2^-14
+ * 1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * Rounding is used when descaling to ensure correct results.
+ */
+
+/*
+ * Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width since the
+ * TurboJPEG API permits it to be allocated with or without padding up to the
+ * next multiple of ALIGN_SIZE bytes.
+ */
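
As a reference for the vector code below, here is a minimal scalar sketch of the fixed-point conversion the comments above describe (illustrative only, not part of this patch; the helper names are hypothetical, and the constant values and rounding mirror the F_0_344/F_0_714/F_1_402/F_1_772 definitions in jdmerge-neon.c):

#include <stdint.h>

/* Not part of the patch: scalar model of the fixed-point YCbCr -> RGB
 * arithmetic vectorized below. */
static inline uint8_t clamp255(int x)
{
  return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void ycc_to_rgb_scalar(int y, int cb, int cr,
                              uint8_t *r, uint8_t *g, uint8_t *b)
{
  int cb_128 = cb - 128;
  int cr_128 = cr - 128;
  /* (11277 * x + 2^14) >> 15 is the rounded approximation of
   * 0.3441467 * x; the other constants work the same way. */
  int g_sub_y = (-11277 * cb_128 - 23401 * cr_128 + (1 << 14)) >> 15;
  int r_sub_y = (22971 * cr_128 + (1 << 13)) >> 14;
  int b_sub_y = (29033 * cb_128 + (1 << 13)) >> 14;
  *r = clamp255(y + r_sub_y);
  *g = clamp255(y + g_sub_y);
  *b = clamp255(y + b_sub_y);
}
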
+
+/*
+ * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr;
+ /* Pointers to Y, Cb and Cr data. */
+ JSAMPROW inptr0, inptr1, inptr2;
+
+ int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0 = input_buf[0][in_row_group_ctr];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* Load Y-values such that even pixel indices are in one vector and odd */
+ /* pixel indices are in another vector. */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+    /* Add Y and duplicate chroma components - upsample horizontally. */
+ int16x8_t g_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0]));
+ int16x8_t r_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0]));
+ int16x8_t b_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0]));
+ int16x8_t g_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1]));
+ int16x8_t r_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1]));
+ int16x8_t b_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ /* Interleave pixel channel values having odd and even pixel indices. */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba;
+ rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr, rgba);
+#else
+ uint8x16x3_t rgb;
+ rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+ rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+ rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr, rgb);
+#endif
+
+ /* Increment pointers. */
+ inptr0 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+    /* Load Y-values such that even pixel indices are in one vector and odd */
+ /* pixel indices are in another vector. */
+ uint8x8x2_t y = vld2_u8(inptr0);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ /* Add Y and duplicate chroma components - upsample horizontally. */
+ int16x8_t g_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0]));
+ int16x8_t r_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0]));
+ int16x8_t b_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0]));
+ int16x8_t g_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1]));
+ int16x8_t r_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1]));
+ int16x8_t b_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ /* Interleave pixel channel values having odd and even pixel indices. */
+ uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+ uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+ uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba_h;
+ rgba_h.val[RGB_RED] = r.val[1];
+ rgba_h.val[RGB_GREEN] = g.val[1];
+ rgba_h.val[RGB_BLUE] = b.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ uint8x8x4_t rgba_l;
+ rgba_l.val[RGB_RED] = r.val[0];
+ rgba_l.val[RGB_GREEN] = g.val[0];
+ rgba_l.val[RGB_BLUE] = b.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15 :
+ vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+ case 14 :
+ vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+ case 13 :
+ vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+ case 12 :
+ vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+ case 11 :
+ vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+ case 10 :
+ vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+ case 9 :
+ vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+ case 8 :
+ vst4_u8(outptr, rgba_l);
+ break;
+ case 7 :
+ vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+ case 6 :
+ vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+ case 5 :
+ vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+ case 4 :
+ vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+ case 3 :
+ vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+ case 2 :
+ vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+ case 1 :
+ vst4_lane_u8(outptr, rgba_l, 0);
+ default :
+ break;
+ }
+#else
+ uint8x8x3_t rgb_h;
+ rgb_h.val[RGB_RED] = r.val[1];
+ rgb_h.val[RGB_GREEN] = g.val[1];
+ rgb_h.val[RGB_BLUE] = b.val[1];
+ uint8x8x3_t rgb_l;
+ rgb_l.val[RGB_RED] = r.val[0];
+ rgb_l.val[RGB_GREEN] = g.val[0];
+ rgb_l.val[RGB_BLUE] = b.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15 :
+ vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+ case 14 :
+ vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+ case 13 :
+ vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+ case 12 :
+ vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+ case 11 :
+ vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+ case 10 :
+ vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+ case 9 :
+ vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+ case 8 :
+ vst3_u8(outptr, rgb_l);
+ break;
+ case 7 :
+ vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+ case 6 :
+ vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+ case 5 :
+ vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+ case 4 :
+ vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+ case 3 :
+ vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+ case 2 :
+ vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+ case 1 :
+ vst3_lane_u8(outptr, rgb_l, 0);
+ default :
+ break;
+ }
+#endif
+ }
+}
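
The "Compute R-Y" and "Compute B-Y" steps above rely on a fixed-point identity that is easy to miss: vqrdmulhq_n_s16(a, f) returns (2*a*f + 2^15) >> 16, so doubling the input first (vshlq_n_s16(x, 1)) yields a rounded multiply by f * 2^-14. A self-checking scalar sketch (an illustration under stated assumptions: it models the intrinsic in plain C and ignores saturation, which cannot occur for the input range used here):

#include <assert.h>
#include <stdint.h>

/* Rounded multiply by f * 2^-14. */
static int16_t descale14(int16_t x, int16_t f)
{
  return (int16_t)(((int32_t)x * f + (1 << 13)) >> 14);
}

/* Scalar model of vqrdmulh: (2*a*f + 2^15) >> 16, saturation ignored. */
static int16_t qrdmulh_model(int16_t a, int16_t f)
{
  return (int16_t)((2 * (int32_t)a * f + (1 << 15)) >> 16);
}

static void check_r_sub_y_identity(void)
{
  /* 22971 is F_1_402; cr_128 spans the full range of (Cr - 128). */
  for (int cr_128 = -128; cr_128 <= 127; cr_128++)
    assert(qrdmulh_model((int16_t)(2 * cr_128), 22971) ==
           descale14((int16_t)cr_128, 22971));
}
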
+
+
+/*
+ * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal
+ * and 2:1 vertical.
+ *
+ * See above for details of color conversion and safe memory buffer access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+ JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf)
+{
+ JSAMPROW outptr0, outptr1;
+ /* Pointers to Y (both rows), Cb and Cr data. */
+ JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+ int16x8_t neg_128 = vdupq_n_s16(-128);
+
+ inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+ inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr0 = output_buf[0];
+ outptr1 = output_buf[1];
+
+ int cols_remaining = output_width;
+ for (; cols_remaining >= 16; cols_remaining -= 16) {
+ /* Load Y-values such that even pixel indices are in one vector and odd */
+ /* pixel indices are in another vector. */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ /* Add Y and duplicate chroma components - upsample horizontally. */
+ int16x8_t g0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0]));
+ int16x8_t r0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0]));
+ int16x8_t b0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0]));
+ int16x8_t g0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1]));
+ int16x8_t r0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1]));
+ int16x8_t b0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1]));
+ int16x8_t g1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0]));
+ int16x8_t r1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0]));
+ int16x8_t b1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0]));
+ int16x8_t g1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1]));
+ int16x8_t r1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1]));
+ int16x8_t b1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ /* Interleave pixel channel values having odd and even pixel indices. */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x16x4_t rgba0, rgba1;
+ rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ vst4q_u8(outptr0, rgba0);
+ vst4q_u8(outptr1, rgba1);
+#else
+ uint8x16x3_t rgb0, rgb1;
+ rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+ rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+ rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+ rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+ rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+ rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+ /* Store RGB pixel data to memory. */
+ vst3q_u8(outptr0, rgb0);
+ vst3q_u8(outptr1, rgb1);
+#endif
+
+ /* Increment pointers. */
+ inptr0_0 += 16;
+ inptr0_1 += 16;
+ inptr1 += 8;
+ inptr2 += 8;
+ outptr0 += (RGB_PIXELSIZE * 16);
+ outptr1 += (RGB_PIXELSIZE * 16);
+ }
+
+ if (cols_remaining > 0) {
+ /* Load Y-values such that even pixel indices are in one vector and */
+ /* odd pixel indices are in another vector. */
+ uint8x8x2_t y0 = vld2_u8(inptr0_0);
+ uint8x8x2_t y1 = vld2_u8(inptr0_1);
+ uint8x8_t cb = vld1_u8(inptr1);
+ uint8x8_t cr = vld1_u8(inptr2);
+ /* Subtract 128 from Cb and Cr. */
+ int16x8_t cr_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+ int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
+ int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
+ g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
+ g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
+ /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+ vrshrn_n_s32(g_sub_y_h, 15));
+ /* Compute R-Y: 1.40200 * (Cr - 128) */
+ int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ /* Compute B-Y: 1.77200 * (Cb - 128) */
+ int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ /* Add Y and duplicate chroma components - upsample horizontally. */
+ int16x8_t g0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0]));
+ int16x8_t r0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0]));
+ int16x8_t b0_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0]));
+ int16x8_t g0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1]));
+ int16x8_t r0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1]));
+ int16x8_t b0_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1]));
+ int16x8_t g1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0]));
+ int16x8_t r1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0]));
+ int16x8_t b1_even = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0]));
+ int16x8_t g1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1]));
+ int16x8_t r1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1]));
+ int16x8_t b1_odd = vreinterpretq_s16_u16(
+ vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+ /* Interleave pixel channel values having odd and even pixel indices. */
+ uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+ uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+ uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+ uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+ uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+ uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+ uint8x8x4_t rgba0_h, rgba1_h;
+ rgba0_h.val[RGB_RED] = r0.val[1];
+ rgba1_h.val[RGB_RED] = r1.val[1];
+ rgba0_h.val[RGB_GREEN] = g0.val[1];
+ rgba1_h.val[RGB_GREEN] = g1.val[1];
+ rgba0_h.val[RGB_BLUE] = b0.val[1];
+ rgba1_h.val[RGB_BLUE] = b1.val[1];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+ uint8x8x4_t rgba0_l, rgba1_l;
+ rgba0_l.val[RGB_RED] = r0.val[0];
+ rgba1_l.val[RGB_RED] = r1.val[0];
+ rgba0_l.val[RGB_GREEN] = g0.val[0];
+ rgba1_l.val[RGB_GREEN] = g1.val[0];
+ rgba0_l.val[RGB_BLUE] = b0.val[0];
+ rgba1_l.val[RGB_BLUE] = b1.val[0];
+ /* Set alpha channel to opaque (0xFF). */
+ rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+ /* Store RGBA pixel data to memory. */
+ switch (cols_remaining) {
+ case 15 :
+ vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+ vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+ case 14 :
+ vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+ vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+ case 13 :
+ vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+ vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+ case 12 :
+ vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+ vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+ case 11 :
+ vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+ vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+ case 10 :
+ vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+ vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+ case 9 :
+ vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+ vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+ case 8 :
+ vst4_u8(outptr0, rgba0_l);
+ vst4_u8(outptr1, rgba1_l);
+ break;
+ case 7 :
+ vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+ vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+ case 6 :
+ vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+ vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+ case 5 :
+ vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+ vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+ case 4 :
+ vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+ vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+ case 3 :
+ vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+ vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+ case 2 :
+ vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+ vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+ case 1 :
+ vst4_lane_u8(outptr0, rgba0_l, 0);
+ vst4_lane_u8(outptr1, rgba1_l, 0);
+ default :
+ break;
+ }
+#else
+ uint8x8x3_t rgb0_h, rgb1_h;
+ rgb0_h.val[RGB_RED] = r0.val[1];
+ rgb1_h.val[RGB_RED] = r1.val[1];
+ rgb0_h.val[RGB_GREEN] = g0.val[1];
+ rgb1_h.val[RGB_GREEN] = g1.val[1];
+ rgb0_h.val[RGB_BLUE] = b0.val[1];
+ rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+ uint8x8x3_t rgb0_l, rgb1_l;
+ rgb0_l.val[RGB_RED] = r0.val[0];
+ rgb1_l.val[RGB_RED] = r1.val[0];
+ rgb0_l.val[RGB_GREEN] = g0.val[0];
+ rgb1_l.val[RGB_GREEN] = g1.val[0];
+ rgb0_l.val[RGB_BLUE] = b0.val[0];
+ rgb1_l.val[RGB_BLUE] = b1.val[0];
+ /* Store RGB pixel data to memory. */
+ switch (cols_remaining) {
+ case 15 :
+ vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+ vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+ case 14 :
+ vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+ vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+ case 13 :
+ vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+ vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+ case 12 :
+ vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+ vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+ case 11 :
+ vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+ vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+ case 10 :
+ vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+ vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+ case 9 :
+ vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+ vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+ case 8 :
+ vst3_u8(outptr0, rgb0_l);
+ vst3_u8(outptr1, rgb1_l);
+ break;
+ case 7 :
+ vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+ vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+ case 6 :
+ vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+ vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+ case 5 :
+ vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+ vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+ case 4 :
+ vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+ vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+ case 3 :
+ vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+ vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+ case 2 :
+ vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+ vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+ case 1 :
+ vst3_lane_u8(outptr0, rgb0_l, 0);
+ vst3_lane_u8(outptr1, rgb1_l, 0);
+ default :
+ break;
+ }
+#endif
+ }
+}
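
The descending switch statements above handle partial vectors with deliberate case fall-through: execution enters at case 'cols_remaining' and each case stores exactly one pixel with a lane store, so no byte beyond output_width * RGB_PIXELSIZE is ever written - which matters because, as noted earlier, the output buffer may be allocated without padding. A scalar model of the same pattern (illustrative; the helper is hypothetical):

#include <stdint.h>

/* Store the first 'remaining' (0-3) pixels of 'px' without writing past
 * the end of 'out'. Each case deliberately falls through to those below. */
static void store_tail3(uint8_t *out, const uint8_t *px, int remaining)
{
  switch (remaining) {
  case 3:
    out[2] = px[2];            /* FALLTHROUGH */
  case 2:
    out[1] = px[1];            /* FALLTHROUGH */
  case 1:
    out[0] = px[0];
  default:
    break;
  }
}
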
diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c
new file mode 100644
index 0000000..e4f5129
--- /dev/null
+++ b/simd/arm/common/jdsample-neon.c
@@ -0,0 +1,557 @@
+/*
+ * jdsample-neon.c - upsampling (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * The diagram below shows a row of samples (luma or chroma) produced by h2v1
+ * downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Each sample contains two of the original pixel channel values. These pixel
+ * channel values are centred at positions p0, p1, p2, p3, p4 and p5 above. To
+ * compute the channel values of the original image, we proportionally blend
+ * the adjacent samples in each row.
+ *
+ * There are three cases to consider:
+ *
+ * 1) The first pixel in the original image.
+ * Pixel channel value p0 contains only a component from sample s0, so we
+ * set p0 = s0.
+ * 2) The last pixel in the original image.
+ * Pixel channel value p5 contains only a component from sample s2, so we
+ * set p5 = s2.
+ * 3) General case (all other pixels in the row).
+ * Apart from the first and last pixels, every other pixel channel value is
+ *    computed by blending the containing sample and the nearest neighbouring
+ * sample in the ratio 3:1.
+ * For example, the pixel channel value centred at p1 would be computed as
+ * follows:
+ * 3/4 * s0 + 1/4 * s1
+ * while the pixel channel value centred at p2 would be:
+ * 3/4 * s1 + 1/4 * s0
+ */
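
A scalar model of the rule just described may help when reading the vector code below (illustrative only; the helper name is hypothetical, and the bias values match the vrshrn/vshrn rounding used in the NEON implementation):

#include <stdint.h>

/* Fancy h2v1 upsampling of one row: w input samples -> 2*w output pixels.
 * Assumes w >= 2. */
static void h2v1_fancy_row_scalar(const uint8_t *in, uint8_t *out, int w)
{
  out[0] = in[0];                            /* case 1: first pixel */
  for (int p = 1; p < 2 * w - 1; p++) {
    int s = p >> 1;                          /* containing sample */
    int n = (p & 1) ? s + 1 : s - 1;         /* nearest neighbouring sample */
    int bias = (p & 1) ? 2 : 1;              /* ordered dithering bias */
    out[p] = (uint8_t)((3 * in[s] + in[n] + bias) >> 2);
  }
  out[2 * w - 1] = in[w - 1];                /* case 2: last pixel */
}
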
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ /* Setup constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* Case 1: first pixel channel value in this row of the original image. */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* General case: */
+ /* 3/4 * containing sample + 1/4 * nearest neighbouring sample */
+ /* For p1: containing sample = s0, nearest neighbouring sample = s1. */
+ /* For p2: containing sample = s1, nearest neighbouring sample = s0. */
+ uint8x16_t s0 = vld1q_u8(inptr);
+ uint8x16_t s1 = vld1q_u8(inptr + 1);
+ /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
+ /* denote low half and high half respectively. */
+ uint16x8_t s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
+ vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
+ vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
+ vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
+ vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Initially 1 - due to having already stored the first pixel of the */
+ /* image. However, in subsequent iterations of the SIMD loop this offset */
+ /* is (2 * colctr - 1) to stay within the bounds of the sample buffers */
+ /* without having to resort to a slow scalar tail case for the last */
+ /* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays" */
+ /* in jmemmgr.c for details. */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+#if defined(__aarch64__) && defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
+ /* Unrolling by four is beneficial on AArch64 as there are 16 additional */
+ /* 128-bit SIMD registers to accommodate the extra data in flight. */
+ #pragma clang loop unroll_count(4)
+#endif
+ /* We use software pipelining to maximise performance. The code indented */
+ /* an extra 6 spaces begins the next iteration of the loop. */
+ for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Multiplication makes vectors twice as wide: '_l' and '_h' */
+ /* suffixes denote low half and high half respectively. */
+ s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
+ vget_low_u8(s0), three_u8);
+ s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
+ vget_high_u8(s0), three_u8);
+ s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
+ vget_low_u8(s1), three_u8);
+ s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
+ vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+ /* Store pixel channel values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1;
+ }
+
+ /* Complete the last iteration of the loop. */
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel channel values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Case 2: last pixel channel value in this row of the original image. */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/*
+ * The diagram below shows a grid-window of samples (luma or chroma) produced
+ * by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * r0 | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * r1 | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ * | p16 p17| p18 p19|
+ * r2 | | |
+ * | p20 p21| p22 p23|
+ * +---------+---------+
+ *
+ * Every sample contains four of the original pixel channel values. The pixels'
+ * channel values are centred at positions p0, p1, p2,..., p23 above. For a
+ * given grid-window position, r1 is always used to denote the row of samples
+ * containing the pixel channel values we are computing. For the top row of
+ * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
+ * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
+ * (p12-p15), the nearest neighbouring samples are in the row below - denoted
+ * by r2.
+ *
+ * To compute the pixel channel values of the original image, we proportionally
+ * blend the sample containing the pixel centre with the nearest neighbouring
+ * samples in each row, column and diagonal.
+ *
+ * There are three cases to consider:
+ *
+ * 1) The first pixel in this row of the original image.
+ * Pixel channel value p8 only contains components from sample column s0.
+ * Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
+ * 2) The last pixel in this row of the original image.
+ * Pixel channel value p11 only contains components from sample column s1.
+ * Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
+ * 3) General case (all other pixels in the row).
+ * Apart from the first and last pixels, every other pixel channel value in
+ * the row contains components from samples in adjacent columns.
+ *
+ * For example, the pixel centred at p9 would be computed as follows:
+ * (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
+ *
+ * This can be broken down into two steps:
+ * 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
+ * s0colsum = 3/4 * s0r1 + 1/4 * s0r0
+ * s1colsum = 3/4 * s1r1 + 1/4 * s1r0
+ * 2) Blend the already-blended columns in the ratio 3:1:
+ * p9 = 3/4 * s0colsum + 1/4 * s1colsum
+ *
+ * The bottom row of pixel channel values in row r1 can be computed in the same
+ * way for each of the three cases, only using samples in row r2 instead of row
+ * r0 - as r2 is the nearest neighbouring row.
+ */
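
Written out in scalar form, the two-step decomposition described above looks like this (a sketch for illustration; the helper is hypothetical, and bias is 7 or 8 depending on output column parity, matching the vshrn/vrshrn pair used below):

/* 9:3:3:1 blend of the four nearest samples, computed as two 3:1 blends
 * so that the divide by 16 is performed only once, at the end. */
static unsigned char h2v2_blend(int s0r1, int s0r0, int s1r1, int s1r0,
                                int bias)
{
  int s0colsum = 3 * s0r1 + s0r0;   /* step 1: vertical blend, column s0 */
  int s1colsum = 3 * s1r1 + s1r0;   /* step 1: vertical blend, column s1 */
  return (unsigned char)((3 * s0colsum + s1colsum + bias) >> 4);  /* step 2 */
}
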
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ /* Setup constants. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+ const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
+ /* respectively. */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* Case 1: first pixel channel value in this row of original image. */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+ /* General case as described above. */
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+ /* Leave the divide by 4 to the end when it can be done for both */
+ /* dimensions at once, right-shifting by 4. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0r0 = vld1q_u8(inptr0);
+ uint8x16_t s0r1 = vld1q_u8(inptr1);
+ uint8x16_t s0r2 = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
+ /* denote low half and high half respectively. */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
+ vget_low_u8(s0r1), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
+ vget_high_u8(s0r1), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
+ vget_low_u8(s0r1), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
+ vget_high_u8(s0r1), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
+ vget_low_u8(s1r1), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
+ vget_high_u8(s1r1), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
+ vget_low_u8(s1r1), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
+ vget_high_u8(s1r1), three_u8);
+ /* Step 2: Blend the already-blended columns. */
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
+ uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4))
+ };
+ uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4))
+ };
+ /* Store pixel channel values to memory. */
+ /* The minimum size of the output buffer for each row is 64 bytes => no */
+ /* need to worry about buffer overflow here. See "Creation of 2-D sample */
+ /* arrays" in jmemmgr.c for details. */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one */
+ /* byte. We have to re-align on a 32-byte boundary at some point before */
+ /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
+ /* within the bounds of the sample buffers without having to resort to a */
+ /* slow scalar tail case for the last (downsampled_width % 16) samples. */
+    /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
+ for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0r0 = vld1q_u8(inptr0 + colctr - 1);
+ s0r1 = vld1q_u8(inptr1 + colctr - 1);
+ s0r2 = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
+ vget_low_u8(s0r1), three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
+ vget_high_u8(s0r1), three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
+ vget_low_u8(s0r1), three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
+ vget_high_u8(s0r1), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1r0 = vld1q_u8(inptr0 + colctr);
+ s1r1 = vld1q_u8(inptr1 + colctr);
+ s1r2 = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
+ vget_low_u8(s1r1), three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
+ vget_high_u8(s1r1), three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
+ vget_low_u8(s1r1), three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
+ vget_high_u8(s1r1), three_u8);
+ /* Step 2: Blend the already-blended columns. */
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel channel values to memory. */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Case 2: last pixel channel value in this row of the original image. */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++;
+ }
+}
+
+
+/*
+ * The diagram below shows a grid-window of samples (luma or chroma) produced
+ * by h2v1 downsampling, which has subsequently been rotated 90 degrees. (The
+ * usual use of h1v2 upsampling is to upsample rotated or transposed h2v1
+ * downsampled images.)
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 | p1 |
+ * r0 | | |
+ * | p2 | p3 |
+ * +---------+---------+
+ * | p4 | p5 |
+ * r1 | | |
+ * | p6 | p7 |
+ * +---------+---------+
+ * | p8 | p9 |
+ * r2 | | |
+ * | p10 | p11 |
+ * +---------+---------+
+ *
+ * Every sample contains two of the original pixel channel values. The pixels'
+ * channel values are centred at positions p0, p1, p2,..., p11 above. For a
+ * given grid-window position, r1 is always used to denote the row of samples
+ * containing the pixel channel values we are computing. For the top row of
+ * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are
+ * in the row above - denoted by r0. Likewise, for the bottom row of pixels in
+ * r1 (p6 and p7), the nearest neighbouring samples are in the row below -
+ * denoted by r2.
+ *
+ * To compute the pixel channel values of the original image, we proportionally
+ * blend the adjacent samples in each column.
+ *
+ * For example, the pixel channel value centred at p4 would be computed as
+ * follows:
+ * 3/4 * s0r1 + 1/4 * s0r0
+ * while the pixel channel value centred at p6 would be:
+ * 3/4 * s0r1 + 1/4 * s0r2
+ */
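
In scalar form, the per-column blend just described is simply (an illustrative sketch with a hypothetical helper name; the bias values match the vshrn/vrshrn rounding below):

#include <stdint.h>

/* Blend one column: r1 is the containing row, r0/r2 the rows above and
 * below; produces the top and bottom output pixels for that column. */
static void h1v2_blend_col(uint8_t r0, uint8_t r1, uint8_t r2,
                           uint8_t *out_top, uint8_t *out_bot)
{
  *out_top = (uint8_t)((3 * r1 + r0 + 1) >> 2);
  *out_bot = (uint8_t)((3 * r1 + r2 + 2) >> 2);
}
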
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ /* Setup constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
+ /* respectively. */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32 */
+ /* bytes => no need to worry about buffer overflow when reading/writing */
+ /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
+ for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load samples. */
+ uint8x16_t r0 = vld1q_u8(inptr0 + colctr);
+ uint8x16_t r1 = vld1q_u8(inptr1 + colctr);
+ uint8x16_t r2 = vld1q_u8(inptr2 + colctr);
+ /* Blend samples vertically. */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)),
+ vget_low_u8(r1), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)),
+ vget_high_u8(r1), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)),
+ vget_low_u8(r1), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)),
+ vget_high_u8(r1), three_u8);
+ /* Add ordered dithering bias to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel channel values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
+
+
+/*
+ * The diagram below shows the operation of h2v1 (simple) upsampling. Each
+ * sample in the row is duplicated to form two output pixel channel values.
+ *
+ * p0 p1 p2 p3
+ * +----+----+ +----+----+----+----+
+ * | s0 | s1 | -> | s0 | s0 | s1 | s1 |
+ * +----+----+ +----+----+----+----+
+ */
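
The scalar equivalent is a plain duplication, shown here only for contrast with the fancy variants above (hypothetical helper name):

#include <stdint.h>

/* Simple h2v1 upsampling: each input sample becomes two output pixels. */
static void h2v1_simple_row(const uint8_t *in, uint8_t *out, int out_w)
{
  for (int c = 0; c < out_w; c++)
    out[c] = in[c / 2];
}
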
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+
+ for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples - the store interleaves them to produce the */
+ /* pattern in the diagram above. */
+ uint8x16x2_t output_pixels = { samples, samples };
+ /* Store pixel values to memory. */
+ /* Due to the way sample buffers are allocated, we don't need to worry */
+ /* about tail cases when output_width is not a multiple of 32. */
+ /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
+ vst2q_u8(outptr + 2 * colctr, output_pixels);
+ }
+ }
+}
+
+
+/*
+ * The diagram below shows the operation of h2v2 (simple) upsampling. Each
+ * sample in the row is duplicated to form two output pixel channel values.
+ * This horizontally-upsampled row is then also duplicated.
+ *
+ * p0 p1 p2 p3
+ * +-----+-----+ +-----+-----+-----+-----+
+ * | s0 | s1 | -> | s0 | s0 | s1 | s1 |
+ * +-----+-----+ +-----+-----+-----+-----+
+ * | s0 | s0 | s1 | s1 |
+ * +-----+-----+-----+-----+
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor,
+ JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+
+ for (int inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples - the store interleaves them to produce the */
+ /* pattern in the diagram above. */
+ uint8x16x2_t output_pixels = { samples, samples };
+ /* Store pixel values to memory for both output rows. */
+ /* Due to the way sample buffers are allocated, we don't need to worry */
+ /* about tail cases when output_width is not a multiple of 32. */
+ /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
+ vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+ vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+ }
+ }
+}
diff --git a/simd/arm/common/jfdctfst-neon.c b/simd/arm/common/jfdctfst-neon.c
new file mode 100644
index 0000000..e7b2e96
--- /dev/null
+++ b/simd/arm/common/jfdctfst-neon.c
@@ -0,0 +1,211 @@
+/*
+ * jfdctfst-neon.c - fast DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_fdct_ifast_neon' performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_fdct_ifast' function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17792 * 2^-15
+ * 0.707106781 = 23168 * 2^-15
+ * 0.306562965 = 9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_ifast_neon' match up
+ * with those in 'jpeg_fdct_ifast'.
+ */
+
+#define F_0_382 12544
+#define F_0_541 17792
+#define F_0_707 23168
+#define F_0_306 9984
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+ F_0_382, F_0_541, F_0_707, F_0_306
+};
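
All four constants are applied through vqdmulhq_lane_s16, which computes (2*a*b) >> 16 with saturation; with b scaled by 2^15 this amounts to a truncating multiply by the fractional constant. A scalar model (illustrative only; saturation is ignored and the helper name is hypothetical):

#include <stdint.h>

/* Scalar model of vqdmulh: multiply by f * 2^-15, truncating. */
static int16_t mulshift15(int16_t a, int16_t f)
{
  return (int16_t)((2 * (int32_t)a * f) >> 16);   /* == (a * f) >> 15 */
}

/* Example: mulshift15(x, 23168) approximates 0.707106781 * x. */
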
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
+ /* are used followed by vuzp to transpose the block such that we have a */
+ /* column of samples per vector - allowing all rows to be processed at */
+ /* once. */
+ int16x8x4_t data1 = vld4q_s16(data);
+ int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Load DCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+ /* Pass 1: process rows. */
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ col4 = vsubq_s16(tmp10, tmp11);
+
+ int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ col2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ col6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ int16x8_t z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+ col5 = vaddq_s16(z13, z2); /* phase 6 */
+ col3 = vsubq_s16(z13, z2);
+ col1 = vaddq_s16(z11, z4);
+ col7 = vsubq_s16(z11, z4);
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2: process columns. */
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part */
+ tmp10 = vaddq_s16(tmp0, tmp3); /* phase 2 */
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vaddq_s16(tmp10, tmp11); /* phase 3 */
+ row4 = vsubq_s16(tmp10, tmp11);
+
+ z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+ row2 = vaddq_s16(tmp13, z1); /* phase 5 */
+ row6 = vsubq_s16(tmp13, z1);
+
+ /* Odd part */
+ tmp10 = vaddq_s16(tmp4, tmp5); /* phase 2 */
+ tmp11 = vaddq_s16(tmp5, tmp6);
+ tmp12 = vaddq_s16(tmp6, tmp7);
+
+ z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+ z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+ z2 = vaddq_s16(z2, z5);
+ z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+ z5 = vaddq_s16(tmp12, z5);
+ z4 = vaddq_s16(z4, z5);
+ z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+ z11 = vaddq_s16(tmp7, z3); /* phase 5 */
+ z13 = vsubq_s16(tmp7, z3);
+
+ row5 = vaddq_s16(z13, z2); /* phase 6 */
+ row3 = vsubq_s16(z13, z2);
+ row1 = vaddq_s16(z11, z4);
+ row7 = vsubq_s16(z11, z4);
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/simd/arm/common/jfdctint-neon.c b/simd/arm/common/jfdctint-neon.c
new file mode 100644
index 0000000..55abb1b
--- /dev/null
+++ b/simd/arm/common/jfdctint-neon.c
@@ -0,0 +1,371 @@
+/*
+ * jfdctint-neon.c - accurate DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_fdct_islow_neon' performs a slow-but-accurate forward DCT (Discrete
+ * Cosine Transform) on one block of samples. It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_islow'
+ * function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_islow_neon' match up
+ * with those in 'jpeg_fdct_islow'.
+ */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
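+
+/* Multiplying by the scaled constants leaves results scaled up by
+ * 2^CONST_BITS. Pass 1 additionally keeps its outputs scaled up by
+ * 2^PASS1_BITS for precision, so DESCALE_P1 removes only part of the
+ * constant scaling, while DESCALE_P2 removes both scalings after pass 2.
+ */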
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+ F_0_298, -F_0_390, F_0_541, F_0_765,
+ -F_0_899, F_1_175, F_1_501, -F_1_847,
+ -F_1_961, F_2_053, -F_2_562, F_3_072
+};
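+
+/* A multiply by one of these constants is a widening 16x16 -> 32-bit multiply
+ * followed by a rounding narrowing shift. A minimal sketch of the idiom (the
+ * vector 'x' here is hypothetical):
+ *   int32x4_t y = vmull_n_s16(x, F_0_541);      y = x * 4433
+ *   int16x4_t r = vrshrn_n_s32(y, CONST_BITS);  r ~= x * 0.541196100
+ */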
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+ /* Load DCT constants. */
+#if defined(__clang__) || defined(_MSC_VER)
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { consts1, consts2, consts3 };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
+  /* are used, followed by vuzp, to transpose the block such that we have a */
+  /* column of samples per vector - allowing all rows to be processed at */
+  /* once. */
+ int16x8x4_t s_rows_0123 = vld4q_s16(data);
+ int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+ int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+ int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+ int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+ int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
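+  /* The vuzp operations complete the transpose begun by the de-interleaving */
+  /* loads: cols_04.val[0] now holds all eight samples of column 0, val[1] */
+  /* holds column 4, and the remaining pairs are laid out analogously. */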
+
+ int16x8_t col0 = cols_04.val[0];
+ int16x8_t col1 = cols_15.val[0];
+ int16x8_t col2 = cols_26.val[0];
+ int16x8_t col3 = cols_37.val[0];
+ int16x8_t col4 = cols_04.val[1];
+ int16x8_t col5 = cols_15.val[1];
+ int16x8_t col6 = cols_26.val[1];
+ int16x8_t col7 = cols_37.val[1];
+
+ /* Pass 1: process rows. */
+ int16x8_t tmp0 = vaddq_s16(col0, col7);
+ int16x8_t tmp7 = vsubq_s16(col0, col7);
+ int16x8_t tmp1 = vaddq_s16(col1, col6);
+ int16x8_t tmp6 = vsubq_s16(col1, col6);
+ int16x8_t tmp2 = vaddq_s16(col2, col5);
+ int16x8_t tmp5 = vsubq_s16(col2, col5);
+ int16x8_t tmp3 = vaddq_s16(col3, col4);
+ int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+ /* Even part. */
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+ int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+ int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+ int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+ col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ int32x4_t z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13),
+ consts.val[0], 2);
+ int32x4_t z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13),
+ consts.val[0], 2);
+
+ int32x4_t col2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
+ consts.val[0], 3);
+ int32x4_t col2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
+ consts.val[0], 3);
+ col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+ int32x4_t col6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
+ consts.val[1], 3);
+ int32x4_t col6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
+ consts.val[1], 3);
+ col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+ vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+ /* Odd part. */
+ int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+ int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+ int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+ vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+ vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+ vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+ vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+ /* Transpose to work on columns in pass 2. */
+ int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+ int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+ int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+ int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+ int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+ vreinterpretq_s32_s16(cols_45.val[0]));
+ int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+ vreinterpretq_s32_s16(cols_45.val[1]));
+ int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+ vreinterpretq_s32_s16(cols_67.val[0]));
+ int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+ vreinterpretq_s32_s16(cols_67.val[1]));
+
+ int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+ int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+ int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+ int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+ int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+ int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+ int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+ int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+ int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+ int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+ int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+ int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+ /* Pass 2. */
+ tmp0 = vaddq_s16(row0, row7);
+ tmp7 = vsubq_s16(row0, row7);
+ tmp1 = vaddq_s16(row1, row6);
+ tmp6 = vsubq_s16(row1, row6);
+ tmp2 = vaddq_s16(row2, row5);
+ tmp5 = vsubq_s16(row2, row5);
+ tmp3 = vaddq_s16(row3, row4);
+ tmp4 = vsubq_s16(row3, row4);
+
+ /* Even part. */
+ tmp10 = vaddq_s16(tmp0, tmp3);
+ tmp13 = vsubq_s16(tmp0, tmp3);
+ tmp11 = vaddq_s16(tmp1, tmp2);
+ tmp12 = vsubq_s16(tmp1, tmp2);
+
+ row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+ row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+ tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+ z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t row2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
+ consts.val[0], 3);
+ int32x4_t row2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
+ consts.val[0], 3);
+ row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+ int32x4_t row6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
+ consts.val[1], 3);
+ int32x4_t row6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
+ consts.val[1], 3);
+ row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+ vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+ /* Odd part. */
+ z1 = vaddq_s16(tmp4, tmp7);
+ z2 = vaddq_s16(tmp5, tmp6);
+ z3 = vaddq_s16(tmp4, tmp6);
+ z4 = vaddq_s16(tmp5, tmp7);
+ /* sqrt(2) * c3 */
+ z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+ z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+ z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+ z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+ /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+ tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+ /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+ tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+ /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+ tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+ /* sqrt(2) * ( c1+c3-c5-c7) */
+ tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+ tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+ /* sqrt(2) * (c7-c3) */
+ z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+ z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+ /* sqrt(2) * (-c1-c3) */
+ z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+ z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+ /* sqrt(2) * (-c3-c5) */
+ z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+ z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+ /* sqrt(2) * (c5-c3) */
+ z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+ z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+ z3_l = vaddq_s32(z3_l, z5_l);
+ z3_h = vaddq_s32(z3_h, z5_h);
+ z4_l = vaddq_s32(z4_l, z5_l);
+ z4_h = vaddq_s32(z4_h, z5_h);
+
+ tmp4_l = vaddq_s32(tmp4_l, z1_l);
+ tmp4_h = vaddq_s32(tmp4_h, z1_h);
+ tmp4_l = vaddq_s32(tmp4_l, z3_l);
+ tmp4_h = vaddq_s32(tmp4_h, z3_h);
+ row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+ vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+ tmp5_l = vaddq_s32(tmp5_l, z2_l);
+ tmp5_h = vaddq_s32(tmp5_h, z2_h);
+ tmp5_l = vaddq_s32(tmp5_l, z4_l);
+ tmp5_h = vaddq_s32(tmp5_h, z4_h);
+ row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+ vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+ tmp6_l = vaddq_s32(tmp6_l, z2_l);
+ tmp6_h = vaddq_s32(tmp6_h, z2_h);
+ tmp6_l = vaddq_s32(tmp6_l, z3_l);
+ tmp6_h = vaddq_s32(tmp6_h, z3_h);
+ row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+ vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+ tmp7_l = vaddq_s32(tmp7_l, z1_l);
+ tmp7_h = vaddq_s32(tmp7_h, z1_h);
+ tmp7_l = vaddq_s32(tmp7_l, z4_l);
+ tmp7_h = vaddq_s32(tmp7_h, z4_h);
+ row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+ vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+ vst1q_s16(data + 0 * DCTSIZE, row0);
+ vst1q_s16(data + 1 * DCTSIZE, row1);
+ vst1q_s16(data + 2 * DCTSIZE, row2);
+ vst1q_s16(data + 3 * DCTSIZE, row3);
+ vst1q_s16(data + 4 * DCTSIZE, row4);
+ vst1q_s16(data + 5 * DCTSIZE, row5);
+ vst1q_s16(data + 6 * DCTSIZE, row6);
+ vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/simd/arm/common/jidctfst-neon.c b/simd/arm/common/jidctfst-neon.c
new file mode 100644
index 0000000..87806fd
--- /dev/null
+++ b/simd/arm/common/jidctfst-neon.c
@@ -0,0 +1,454 @@
+/*
+ * jidctfst-neon.c - fast IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * 'jsimd_idct_ifast_neon' performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original 'jpeg_idct_ifast' function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.082392200 = 2688 * 2^-15
+ * 0.414213562 = 13568 * 2^-15
+ * 0.847759065 = 27776 * 2^-15
+ * 0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_idct_ifast_neon' match up
+ * with those in 'jpeg_idct_ifast'.
+ */
+
+#define PASS1_BITS 2
+
+#define F_0_082 2688
+#define F_0_414 13568
+#define F_0_847 27776
+#define F_0_613 20096
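+
+/* Multiplication by a 2^-15-scaled constant uses the saturating doubling
+ * multiply-high instruction: vqdmulh(a, b) computes (2 * a * b) >> 16, i.e.
+ * (a * b) >> 15. A minimal sketch (the vector 'x' is hypothetical):
+ *   int16x8_t y = vqdmulhq_n_s16(x, F_0_414);   y ~= x * 0.414213562
+ * The true constants all exceed 1 (1.082392200, 1.414213562, 1.847759065 and
+ * 2.613125930), so each multiply below uses the fractional part via vqdmulh
+ * and adds the input back in explicitly for the integer part.
+ */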
+
+void jsimd_idct_ifast_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ IFAST_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct bitmap to test if all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row4);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero. */
+    /* The IDCT result for each column is then just its dequantized DC */
+    /* value, replicated down the column. */
+ int16x8_t dcval = row0;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ row4 = dcval;
+ row5 = dcval;
+ row6 = dcval;
+ row7 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2 and 3. */
+ /* Use DC values for these columns. */
+ int16x4_t dcval = vget_low_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 4, 5, 6 and 7. */
+    /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_high_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_n_s16(z12, F_0_082);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_n_s16(neg_z10, F_0_613);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
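+    /* (z10 is negated on input so that the -2.613125930 multiply can use */
+    /* the positive fractional constant F_0_613, with the doubled input */
+    /* added back in.) */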
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+ row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+ row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+ row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+ row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+ row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+ row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+ row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6 and 7. */
+ /* Use DC values for these columns. */
+ int16x4_t dcval = vget_high_s16(row0);
+
+ /* Commence regular fast IDCT computation for columns 0, 1, 2 and 3. */
+    /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x4_t tmp0 = vget_low_s16(row0);
+ int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+ int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */
+ int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+ int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+ int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414);
+ tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsub_s16(tmp12, tmp13);
+
+ tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsub_s16(tmp10, tmp13);
+ tmp1 = vadd_s16(tmp11, tmp12);
+ tmp2 = vsub_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+ int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+ int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+ int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */
+ int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+ int16x4_t z11 = vadd_s16(tmp4, tmp7);
+ int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+ tmp7 = vadd_s16(z11, z13); /* phase 5 */
+ int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+ tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+ int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+ int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847);
+ z5 = vadd_s16(z5, z10_add_z12);
+ tmp10 = vqdmulh_n_s16(z12, F_0_082);
+ tmp10 = vadd_s16(tmp10, z12);
+ tmp10 = vsub_s16(tmp10, z5);
+ tmp12 = vqdmulh_n_s16(neg_z10, F_0_613);
+ tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+ tmp12 = vadd_s16(tmp12, z5);
+
+ tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsub_s16(tmp11, tmp6);
+ tmp4 = vadd_s16(tmp10, tmp5);
+
+ row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+ row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+ row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+ row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+ row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+ row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+ row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+ row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+ } else {
+ /* Some AC coefficients are non-zero; full IDCT calculation required. */
+    /* Load quantization table. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part: dequantize DCT coefficients. */
+ int16x8_t tmp0 = row0;
+ int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+ int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+ int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+ int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */
+ int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+ int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
+ int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+ int16x8_t tmp12 = vqdmulhq_n_s16(tmp1_sub_tmp3, F_0_414);
+ tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */
+ tmp3 = vsubq_s16(tmp10, tmp13);
+ tmp1 = vaddq_s16(tmp11, tmp12);
+ tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part: dequantize DCT coefficients. */
+ int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+ int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+ int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+ int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+ int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */
+ int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+ int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+ int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+ tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_n_s16(z12, F_0_082);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ tmp5 = vsubq_s16(tmp11, tmp6);
+ tmp4 = vaddq_s16(tmp10, tmp5);
+
+ row0 = vaddq_s16(tmp0, tmp7);
+ row7 = vsubq_s16(tmp0, tmp7);
+ row1 = vaddq_s16(tmp1, tmp6);
+ row6 = vsubq_s16(tmp1, tmp6);
+ row2 = vaddq_s16(tmp2, tmp5);
+ row5 = vsubq_s16(tmp2, tmp5);
+ row4 = vaddq_s16(tmp3, tmp4);
+ row3 = vsubq_s16(tmp3, tmp4);
+ }
+
+  /* Transpose rows to work on columns in pass 2. */
+ int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+ int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+ int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+ int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+ vreinterpretq_s32_s16(rows_45.val[0]));
+ int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+ vreinterpretq_s32_s16(rows_45.val[1]));
+ int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+ vreinterpretq_s32_s16(rows_67.val[0]));
+ int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+ vreinterpretq_s32_s16(rows_67.val[1]));
+
+ int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+ int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+ int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+ int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+ int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+ int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+ int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+ int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+ int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+ int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+ int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+ int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+ /* 1-D IDCT, pass 2. */
+ /* Even part. */
+ int16x8_t tmp10 = vaddq_s16(col0, col4);
+ int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+ int16x8_t tmp13 = vaddq_s16(col2, col6);
+ int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+ int16x8_t tmp12 = vqdmulhq_n_s16(col2_sub_col6, F_0_414);
+ tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+ tmp12 = vsubq_s16(tmp12, tmp13);
+
+ int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+ int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+ int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+ int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+ /* Odd part. */
+ int16x8_t z13 = vaddq_s16(col5, col3);
+ int16x8_t neg_z10 = vsubq_s16(col3, col5);
+ int16x8_t z11 = vaddq_s16(col1, col7);
+ int16x8_t z12 = vsubq_s16(col1, col7);
+
+ int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
+ int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+ tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+ int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+ int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847);
+ z5 = vaddq_s16(z5, z10_add_z12);
+ tmp10 = vqdmulhq_n_s16(z12, F_0_082);
+ tmp10 = vaddq_s16(tmp10, z12);
+ tmp10 = vsubq_s16(tmp10, z5);
+ tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613);
+ tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+ tmp12 = vaddq_s16(tmp12, z5);
+
+ int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */
+ int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+ int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+ col0 = vaddq_s16(tmp0, tmp7);
+ col7 = vsubq_s16(tmp0, tmp7);
+ col1 = vaddq_s16(tmp1, tmp6);
+ col6 = vsubq_s16(tmp1, tmp6);
+ col2 = vaddq_s16(tmp2, tmp5);
+ col5 = vsubq_s16(tmp2, tmp5);
+ col4 = vaddq_s16(tmp3, tmp4);
+ col3 = vsubq_s16(tmp3, tmp4);
+
+ /* Scale down by factor of 8, narrowing to 8-bit. */
+ int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+ vqshrn_n_s16(col1, PASS1_BITS + 3));
+ int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+ vqshrn_n_s16(col5, PASS1_BITS + 3));
+ int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+ vqshrn_n_s16(col3, PASS1_BITS + 3));
+ int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+ vqshrn_n_s16(col7, PASS1_BITS + 3));
+ /* Clamp to range [0-255]. */
+ uint8x16_t cols_01 = vreinterpretq_u8_s8(
+ vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 = vreinterpretq_u8_s8(
+ vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 = vreinterpretq_u8_s8(
+ vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 = vreinterpretq_u8_s8(
+ vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
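+  /* (The signed saturating shift above clamps results to [-128, 127]; */
+  /* adding CENTERJSAMPLE = 128 then maps the samples into [0, 255].) */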
+
+ /* Transpose block ready for store. */
+ uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+ vreinterpretq_u32_u8(cols_45));
+ uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+ vreinterpretq_u32_u8(cols_67));
+
+ uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+ vreinterpretq_u8_u32(cols_0415.val[1]));
+ uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+ vreinterpretq_u8_u32(cols_2637.val[1]));
+ uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+ vreinterpretq_u16_u8(cols_2367.val[0]));
+ uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+ vreinterpretq_u16_u8(cols_2367.val[1]));
+
+ uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+ uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+ uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+ uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ JSAMPROW outptr4 = output_buf[4] + output_col;
+ JSAMPROW outptr5 = output_buf[5] + output_col;
+ JSAMPROW outptr6 = output_buf[6] + output_col;
+ JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store the block of output samples to memory. */
+ vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+ vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+ vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+ vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+ vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+ vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+ vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+ vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
new file mode 100644
index 0000000..0fd4a36
--- /dev/null
+++ b/simd/arm/common/jidctint-neon.c
@@ -0,0 +1,758 @@
+/*
+ * jidctint-neon.c - slow IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile-time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ * 0.298631336 = 2446 * 2^-13
+ * 0.390180644 = 3196 * 2^-13
+ * 0.541196100 = 4433 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.175875602 = 9633 * 2^-13
+ * 1.501321110 = 12299 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 1.961570560 = 16069 * 2^-13
+ * 2.053119869 = 16819 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ * 3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};
+
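+/* The constants are loaded into three 4-lane vectors ('consts' in the
+ * functions below) and selected with compile-time lane indices. A minimal
+ * sketch of the multiply-accumulate idiom used throughout, mirroring the
+ * even part of pass 1:
+ *   tmp2 = vmull_lane_s16(z2, consts.val[0], 1);        z2 * F_0_541
+ *   tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 1);  += z3 * F_0_541_MINUS_1_847
+ */
+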
+/* Forward declaration of regular and sparse IDCT helper functions. */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset);
+
+
+/* Performs dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation 'jpeg_idct_islow' can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for data access:
+ *
+ * In each pass, the inverse DCT is computed on the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure and the
+ * increased granularity allows an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants, and swapping quadrants 1 and 2 (in the
+ * diagram below). Swapping quadrants is cheap, as the second pass can just load
+ * from the other workspace buffer.
+ *
+ * +-------+-------+ +-------+-------+
+ * | | | | | |
+ * | 0 | 1 | | 0 | 2 |
+ * | | | transpose | | |
+ * +-------+-------+ ------> +-------+-------+
+ * | | | | | |
+ * | 2 | 3 | | 1 | 3 |
+ * | | | | | |
+ * +-------+-------+ +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * moving diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, the inverse DCT calculation can be simplified. In this
+ * NEON implementation, on the first pass of the inverse DCT, we test for three
+ * special cases before defaulting to a full 'regular' inverse DCT:
+ *
+ * i) AC and DC coefficients are all zero. (Only tested for the right 4x8
+ * half of the DCT coefficient block.) In this case the inverse DCT result
+ * is all zero. We do no work here, signalling that the 'sparse' case is
+ * required in the second pass.
+ * ii) AC coefficients (all but the top row) are zero. In this case, the value
+ * of the inverse DCT of the AC coefficients is just the DC coefficients.
+ * iii) Coefficients of rows 4, 5, 6 and 7 are all zero. In this case we opt to
+ * execute a 'sparse' simplified inverse DCT.
+ *
+ * In the second pass, only a single special case is tested: whether the AC
+ * and DC coefficients were all zero in the right 4x8 block in the first pass
+ * (case 'i'). If this is the case, a 'sparse' variant of the second pass
+ * inverse DCT is executed for both the left and right halves of the DCT block.
+ * (The transposition after the first pass would have made the bottom half of
+ * the block all zero.)
+ */
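+
+/* A minimal sketch of the pass-1 dispatch that implements these cases for
+ * the left 4x8 block (variable and function names as used below):
+ *
+ *   bitmap = row4 | row5 | row6 | row7;
+ *   if (bitmap == 0) {
+ *     bitmap |= row1 | row2 | row3;
+ *     if (bitmap == 0)  replicate DC values;            (case ii)
+ *     else              jsimd_idct_islow_pass1_sparse;  (case iii)
+ *   } else {
+ *     jsimd_idct_islow_pass1_regular;
+ *   }
+ */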
+
+void jsimd_idct_islow_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ int16_t workspace_l[8 * DCTSIZE / 2];
+ int16_t workspace_r[8 * DCTSIZE / 2];
+
+ /* Compute IDCT first pass on left 4x8 coefficient block. */
+ /* Load DCT coefficients in left 4x8 block. */
+ int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+ int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+ int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+ int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+ int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+ int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+ int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+ int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table for left 4x8 block. */
+ int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+ int16x4_t bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (bitmap_rows_4567 == 0) {
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (left_ac_bitmap == 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l, quadrant);
+ vst4_s16(workspace_r, quadrant);
+ } else {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l, workspace_r);
+ }
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l, workspace_r);
+ }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+ /* Load DCT coefficients for right 4x8 block. */
+ row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+ row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+ row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+ row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+ row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+ row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+ row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+ row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+ /* Load quantization table for right 4x8 block. */
+ quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+ quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+ quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+ bitmap = vorr_s16(row7, row6);
+ bitmap = vorr_s16(bitmap, row5);
+ bitmap = vorr_s16(bitmap, row4);
+ bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+ bitmap = vorr_s16(bitmap, row3);
+ bitmap = vorr_s16(bitmap, row2);
+ bitmap = vorr_s16(bitmap, row1);
+ int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ /* Initialise to non-zero value: defaults to regular second pass. */
+ int64_t right_ac_dc_bitmap = 1;
+
+ if (right_ac_bitmap == 0) {
+ bitmap = vorr_s16(bitmap, row0);
+ right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+ if (right_ac_dc_bitmap != 0) {
+ int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+ int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ /* Store 4x4 blocks to workspace, transposing in the process. */
+ vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+ vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+ }
+ } else {
+ if (bitmap_rows_4567 == 0) {
+ jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+ quant_row1, quant_row2, quant_row3,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ } else {
+ jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+ row6, row7, quant_row0, quant_row1,
+ quant_row2, quant_row3, quant_row4,
+ quant_row5, quant_row6, quant_row7,
+ workspace_l + 4 * DCTSIZE / 2,
+ workspace_r + 4 * DCTSIZE / 2);
+ }
+ }
+
+ /* Second pass: compute IDCT on rows in workspace. */
+ /* If all coefficients in right 4x8 block are 0, use 'sparse' second pass. */
+ if (right_ac_dc_bitmap == 0) {
+ jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+ } else {
+ jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+ jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+ }
+}
+
+
+/* Performs dequantization and the first pass of the slow-but-accurate inverse
+ * DCT on a 4x8 block of coefficients. (To process the full 8x8 DCT block this
+ * function - or some other optimized variant - needs to be called on both the
+ * right and left 4x8 blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the slow IDCT 'jpeg_idct_islow' can be found
+ * in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t row4,
+ int16x4_t row5,
+ int16x4_t row6,
+ int16x4_t row7,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16x4_t quant_row4,
+ int16x4_t quant_row5,
+ int16x4_t quant_row6,
+ int16x4_t quant_row7,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT calculation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ z3_s16 = vmul_s16(row4, quant_row4);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+ int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
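+  /* (Substituting z5 shows the two forms agree:
+   *  z3 * -1.961570560 + (z3 + z4) * 1.175875602
+   *    = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602,
+   *  and symmetrically for z4, so the whole step becomes two multiplies and
+   *  two multiply-accumulates with no separate z5 term.)
+   */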
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ };
+ int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ };
+
+ /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
+ /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Performs dequantization and the first pass of the slow-but-accurate inverse
+ * DCT on a 4x8 block of coefficients.
+ *
+ * This 'sparse' version assumes that the AC coefficients in rows 4, 5, 6 and 7
+ * are all 0. This simplifies the IDCT calculation, accelerating overall
+ * performance.
+ */
+
+static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+ int16x4_t row1,
+ int16x4_t row2,
+ int16x4_t row3,
+ int16x4_t quant_row0,
+ int16x4_t quant_row1,
+ int16x4_t quant_row2,
+ int16x4_t quant_row3,
+ int16_t *workspace_1,
+ int16_t *workspace_2)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+ /* z3 is all 0. */
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vmul_s16(row0, quant_row0);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ /* tmp0 and tmp1 are both all 0. */
+ int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+ int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ };
+ int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ };
+
+ /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
+ /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ vst4_s16(workspace_1, rows_0123);
+ vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+ * of coefficients. (To process the full 8x8 DCT block this function - or some
+ * other optimized variant - needs to be called on both the right and left 4x8
+ * blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the slow IDCT 'jpeg_idct_islow' can
+ * be found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+ tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+ int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+ int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+ int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z5 = (z3 + z4) * 1.175875602;
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
+ * z3 += z5; z4 += z5;
+ *
+ * This implementation:
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+ */
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
+ * tmp0 += z1 + z3; tmp1 += z2 + z4;
+ * tmp2 += z2 + z3; tmp3 += z1 + z4;
+ *
+ * This implementation:
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+ * tmp0 += z3; tmp1 += z4;
+ * tmp2 += z3; tmp3 += z4;
+ */
+
+ tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+ tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+ tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+ tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+ tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+ tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+ tmp0 = vaddq_s32(tmp0, z3);
+ tmp1 = vaddq_s32(tmp1, z4);
+ tmp2 = vaddq_s32(tmp2, z3);
+ tmp3 = vaddq_s32(tmp3, z4);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. */
+ /* Zipping adjacent columns together allows us to store 16-bit elements. */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
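
The z3/z4 rearrangement documented inside 'jsimd_idct_islow_pass2_regular'
above can be checked by substituting z5 into the original expressions; folding
z5 away leaves each output as two multiply-accumulates, which map directly
onto one VMULL plus one VMLAL:

    \begin{aligned}
    z_5  &= (z_3 + z_4) \cdot 1.175875602 \\
    z_3' &= -1.961570560 \, z_3 + z_5
          = (1.175875602 - 1.961570560) \, z_3 + 1.175875602 \, z_4 \\
    z_4' &= -0.390180644 \, z_4 + z_5
          = 1.175875602 \, z_3 + (1.175875602 - 0.390180644) \, z_4
    \end{aligned}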
+
+
+/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This 'sparse' version assumes that the coefficient values (after the first
+ * pass) in rows 4, 5, 6 and 7 are all 0. This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+ JSAMPARRAY output_buf,
+ JDIMENSION output_col,
+ unsigned buf_offset)
+{
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+ /* Even part. */
+ int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+ /* z3 is all 0. */
+
+ int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+ int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+ z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+ int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+ int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+ int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+ int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+ /* Odd part. */
+ /* tmp0 and tmp1 are both all 0. */
+ int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+ int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+ int16x4_t z3_s16 = tmp2_s16;
+ int16x4_t z4_s16 = tmp3_s16;
+
+ int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+ z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+ int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+ z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+ tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+ tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+ tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+ tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+ vaddhn_s32(tmp12, tmp1));
+ int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+ vaddhn_s32(tmp13, tmp0));
+ int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+ vsubhn_s32(tmp11, tmp2));
+ int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+ vsubhn_s32(tmp10, tmp3));
+ /* Descale and narrow to 8-bit. */
+ int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+ int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+ int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+ int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+ /* Clamp to range [0-255]. */
+ uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+ uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+ vdup_n_u8(CENTERJSAMPLE));
+
+ /* Transpose 4x8 block and store to memory. */
+ /* Zipping adjacent columns together allows us to store 16-bit elements. */
+ uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+ uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+ uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ };
+
+ JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+ JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+ JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+ JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+ /* VST4 of 16-bit elements completes the transpose. */
+ vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+ vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+ vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+ vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
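
One detail worth spelling out for both pass-2 variants: vaddhn/vsubhn keep
only the high 16 bits of each 32-bit lane, i.e. they shift right by 16 for
free while narrowing, so the following vqrshrn only has to shift by the
remainder. Up to rounding, and assuming DESCALE_P2 carries the usual
jidctint.c value (it is defined earlier in this file, outside the excerpt
shown here), the two stages compose to the full descale:

    (x \gg 16) \gg (\mathrm{DESCALE\_P2} - 16) = x \gg \mathrm{DESCALE\_P2},
    \qquad
    \mathrm{DESCALE\_P2} = \mathrm{CONST\_BITS} + \mathrm{PASS1\_BITS} + 3 = 18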
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
new file mode 100644
index 0000000..ed4232c
--- /dev/null
+++ b/simd/arm/common/jidctred-neon.c
@@ -0,0 +1,469 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define F_0_211 1730
+#define F_0_509 4176
+#define F_0_601 4926
+#define F_0_720 5906
+#define F_0_765 6270
+#define F_0_850 6967
+#define F_0_899 7373
+#define F_1_061 8697
+#define F_1_272 10426
+#define F_1_451 11893
+#define F_1_847 15137
+#define F_2_172 17799
+#define F_2_562 20995
+#define F_3_624 29692
+
+/*
+ * 'jsimd_idct_2x2_neon' is an inverse-DCT function for computing reduced-size
+ * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original 'jpeg_idct_2x2' function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.720959822 = 5906 * 2^-13
+ * 0.850430095 = 6967 * 2^-13
+ * 1.272758580 = 10426 * 2^-13
+ * 3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 reduced IDCT algorithm. Where
+ * possible, the variable names and comments here in 'jsimd_idct_2x2_neon'
+ * match up with those in 'jpeg_idct_2x2'.
+ *
+ * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT which
+ * requires fewer arithmetic operations and hence should be faster. The
+ * primary purpose of this particular NEON-optimized function is bit-exact
+ * compatibility with jpeg-6b.
+ */
+
+void jsimd_idct_2x2_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY restrict output_buf,
+ JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load DCT quantization table. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Dequantize DCT coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+ row1 = vmulq_s16(row1, quant_row1);
+ row3 = vmulq_s16(row3, quant_row3);
+ row5 = vmulq_s16(row5, quant_row5);
+ row7 = vmulq_s16(row7, quant_row7);
+
+ /* Pass 1: process input columns; put results in vectors row0 and row1. */
+ /* Even part. */
+ int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+ int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+ /* Odd part. */
+ int32x4_t tmp0_l = vmull_n_s16(vget_low_s16(row1), F_3_624);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row3), -F_1_272);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row5), F_0_850);
+ tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row7), -F_0_720);
+ int32x4_t tmp0_h = vmull_n_s16(vget_high_s16(row1), F_3_624);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row3), -F_1_272);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row5), F_0_850);
+ tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row7), -F_0_720);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+ row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+ /* Transpose two rows ready for second pass. */
+ int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+ int16x8_t cols_0246 = cols_0246_1357.val[0];
+ int16x8_t cols_1357 = cols_0246_1357.val[1];
+ /* Duplicate columns such that each is accessible in its own vector. */
+ int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+ vreinterpretq_s32_s16(cols_1357));
+ int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+ int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+ /* Pass 2: process 2 rows, store to output array. */
+ /* Even part: only interested in col0; top half of tmp10 is "don't care". */
+ int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+ /* Odd part. Only interested in bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_n_s16(vget_low_s16(cols_1155), F_3_624);
+ tmp0 = vmlal_n_s16(tmp0, vget_low_s16(cols_3377), -F_1_272);
+ tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_1155), F_0_850);
+ tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_3377), -F_0_720);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+ vsubhn_s32(tmp10, tmp0));
+ output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+ CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+ /* Narrow to 8-bit and convert to unsigned. */
+ uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+ /* Store 2x2 block to memory. */
+ vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+ vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+ vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+ vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
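
The F_* fixed-point constants at the top of this file can be regenerated
mechanically. A minimal sketch (not part of the patch) that rounds each real
constant to 13 fractional bits, matching the 2^-13 scaling listed above:

    /* Minimal sketch (not part of the patch): derive the 2x2 IDCT's
     * fixed-point constants by rounding to 13 fractional bits. */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      const double c[] = { 0.720959822, 0.850430095, 1.272758580, 3.624509785 };
      for (int i = 0; i < 4; i++)
        printf("%.9f -> %5ld\n", c[i], lround(c[i] * (1 << 13)));
      /* Expected: 5906 (F_0_720), 6967 (F_0_850), 10426 (F_1_272),
       * 29692 (F_3_624) -- the values #defined above. */
      return 0;
    }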
+
+
+/*
+ * 'jsimd_idct_4x4_neon' is an inverse-DCT function for computing reduced-size
+ * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original 'jpeg_idct_4x4' function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ * 0.211164243 = 1730 * 2^-13
+ * 0.509795579 = 4176 * 2^-13
+ * 0.601344887 = 4926 * 2^-13
+ * 0.765366865 = 6270 * 2^-13
+ * 0.899976223 = 7373 * 2^-13
+ * 1.061594337 = 8697 * 2^-13
+ * 1.451774981 = 11893 * 2^-13
+ * 1.847759065 = 15137 * 2^-13
+ * 2.172734803 = 17799 * 2^-13
+ * 2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 reduced IDCT algorithm. Where
+ * possible, the variable names and comments here in 'jsimd_idct_4x4_neon'
+ * match up with those in 'jpeg_idct_4x4'.
+ *
+ * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT which
+ * requires fewer arithmetic operations and hence should be faster. The
+ * primary purpose of this particular NEON-optimized function is bit-exact
+ * compatibility with jpeg-6b.
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+ };
+
+void jsimd_idct_4x4_neon(void *dct_table,
+ JCOEFPTR coef_block,
+ JSAMPARRAY restrict output_buf,
+ JDIMENSION output_col)
+{
+ ISLOW_MULT_TYPE *quantptr = dct_table;
+
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+ /* Load quantization table values for DC coefficients. */
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ /* Dequantize DC coefficients. */
+ row0 = vmulq_s16(row0, quant_row0);
+
+ /* Construct a bitmap to test whether all AC coefficients are 0. */
+ int16x8_t bitmap = vorrq_s16(row1, row2);
+ bitmap = vorrq_s16(bitmap, row3);
+ bitmap = vorrq_s16(bitmap, row5);
+ bitmap = vorrq_s16(bitmap, row6);
+ bitmap = vorrq_s16(bitmap, row7);
+
+ int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+ int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+ /* Load constants for IDCT computation. */
+ const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+
+ if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+ /* All AC coefficients are zero. */
+ /* Compute DC values and duplicate into row vectors 0, 1, 2 and 3. */
+ int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+ row0 = dcval;
+ row1 = dcval;
+ row2 = dcval;
+ row3 = dcval;
+ } else if (left_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 0, 1, 2 and 3. */
+ /* Compute DC values for these columns. */
+ int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 4, 5, 6 and 7. */
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+ /* Even part. */
+ int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part. */
+ int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1));
+ } else if (right_ac_bitmap == 0) {
+ /* AC coefficients are zero for columns 4, 5, 6 and 7. */
+ /* Compute DC values for these columns. */
+ int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+ /* Commence regular IDCT computation for columns 0, 1, 2 and 3. */
+ /* Load quantization table. */
+ int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+ int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+ int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+ int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+ int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+ int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part. */
+ int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+ int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+ int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+ int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part. */
+ int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+ z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+ z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+ int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+ tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+ CONST_BITS - PASS1_BITS + 1), dcval);
+ } else {
+ /* Neither half has all-zero AC coefficients; a full IDCT is required. */
+ int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+ int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+ int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+ int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+ int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+ int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+ /* Even part. */
+ int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+ int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+ int16x8_t z2 = vmulq_s16(row2, quant_row2);
+ int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+ int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+ int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+ int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+ int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+ int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+ /* Odd part. */
+ int16x8_t z1 = vmulq_s16(row7, quant_row7);
+ z2 = vmulq_s16(row5, quant_row5);
+ z3 = vmulq_s16(row3, quant_row3);
+ int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+ tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+ tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+ tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+ tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+ tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+ tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+ /* Final output stage: descale and narrow to 16-bit. */
+ row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+ CONST_BITS - PASS1_BITS + 1),
+ vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+ CONST_BITS - PASS1_BITS + 1));
+ }
+
+ /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+ int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+ int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+ int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+ vreinterpretq_s32_s16(row_23.val[0]));
+ int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+ vreinterpretq_s32_s16(row_23.val[1]));
+
+ int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+ int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+ int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+ int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+ int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+ int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+ int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+ /* Commence second pass of IDCT. */
+ /* Even part. */
+ int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+ int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+ int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+ int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+ /* Odd part. */
+ tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+ tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+ tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+ tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+ tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+ tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+ tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+ tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+ /* Final output stage: descale and clamp to range [0-255]. */
+ int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+ vsubhn_s32(tmp12, tmp0));
+ int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+ vsubhn_s32(tmp10, tmp2));
+ output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+ CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements. */
+ /* Interleaving store completes the transpose. */
+ uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+ vqmovun_s16(output_cols_13));
+ uint16x4x2_t output_01_23 = { vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ };
+
+ /* Store 4x4 block to memory. */
+ JSAMPROW outptr0 = output_buf[0] + output_col;
+ JSAMPROW outptr1 = output_buf[1] + output_col;
+ JSAMPROW outptr2 = output_buf[2] + output_col;
+ JSAMPROW outptr3 = output_buf[3] + output_col;
+ vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+ vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+ vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+ vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
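
The all-zero AC test at the top of 'jsimd_idct_4x4_neon' is worth isolating.
A standalone sketch (not part of the patch, names hypothetical) of the same
idea: OR the AC rows together, then inspect each 64-bit half, which covers
four 16-bit columns at once:

    /* Sketch (not part of the patch): a 64-bit half of the OR-bitmap is zero
     * iff every AC coefficient in the corresponding four columns is zero. */
    #include <arm_neon.h>
    #include <stdbool.h>

    static bool half_has_nonzero_ac(int16x8_t r1, int16x8_t r2, int16x8_t r3,
                                    int16x8_t r5, int16x8_t r6, int16x8_t r7,
                                    int half /* 0 = columns 0-3, 1 = 4-7 */)
    {
      int16x8_t bitmap = vorrq_s16(vorrq_s16(r1, r2), vorrq_s16(r3, r5));
      bitmap = vorrq_s16(bitmap, vorrq_s16(r6, r7));
      int64x2_t lanes = vreinterpretq_s64_s16(bitmap);
      return (half ? vgetq_lane_s64(lanes, 1) : vgetq_lane_s64(lanes, 0)) != 0;
    }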
diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c
new file mode 100644
index 0000000..6f8a3ab
--- /dev/null
+++ b/simd/arm/common/jquanti-neon.c
@@ -0,0 +1,190 @@
+/*
+ * jquanti-neon.c - sample conversion and integer quantization (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+/*
+ * Pixel channel sample values have range [0,255]. The Discrete Cosine
+ * Transform (DCT) operates on values centered around 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
+ * are also widened from 8-bit to 16-bit.
+ *
+ * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data,
+ JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+ uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+ uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+ uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+ uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+ uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+ uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+ uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+ int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6,
+ vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7,
+ vdup_n_u8(CENTERJSAMPLE)));
+
+ vst1q_s16(workspace + 0 * DCTSIZE, row0);
+ vst1q_s16(workspace + 1 * DCTSIZE, row1);
+ vst1q_s16(workspace + 2 * DCTSIZE, row2);
+ vst1q_s16(workspace + 3 * DCTSIZE, row3);
+ vst1q_s16(workspace + 4 * DCTSIZE, row4);
+ vst1q_s16(workspace + 5 * DCTSIZE, row5);
+ vst1q_s16(workspace + 6 * DCTSIZE, row6);
+ vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
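
For comparison, a scalar sketch equivalent to the routine above (not part of
the patch; parameter types are simplified from libjpeg's JSAMPARRAY for
self-containment, and the real scalar version is 'convsamp' in jcdctmgr.c):

    /* Scalar sketch (not part of the patch): level-shift samples to
     * [-128, 127] and widen to 16-bit, one coefficient at a time. */
    #include <stdint.h>

    #define DCTSIZE       8
    #define CENTERJSAMPLE 128

    static void convsamp_scalar(uint8_t *sample_data[DCTSIZE],
                                unsigned start_col, int16_t *workspace)
    {
      for (int row = 0; row < DCTSIZE; row++)
        for (int col = 0; col < DCTSIZE; col++)
          workspace[row * DCTSIZE + col] =
            (int16_t)(sample_data[row][start_col + col] - CENTERJSAMPLE);
    }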
+
+
+/*
+ * After the DCT, the resulting coefficient values need to be divided by a
+ * quantization value.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocal of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function 'quantize' can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block,
+ DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ JCOEFPTR out_ptr = coef_block;
+ UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+ UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+ DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+
+ for (int i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ /* Load DCT coefficients. */
+ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+ /* Load reciprocals of quantization values. */
+ uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+ uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+ uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+ uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+ uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+ int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+ int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+ int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+ int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+ /* Extract sign from coefficients. */
+ int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+ int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+ int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+ int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+ /* Get absolute value of DCT coefficients. */
+ uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+ uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+ uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+ uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+ /* Add correction. */
+ abs_row0 = vaddq_u16(abs_row0, corr0);
+ abs_row1 = vaddq_u16(abs_row1, corr1);
+ abs_row2 = vaddq_u16(abs_row2, corr2);
+ abs_row3 = vaddq_u16(abs_row3, corr3);
+
+ /* Multiply DCT coefficients by quantization reciprocal. */
+ int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+ vget_low_u16(recip0)));
+ int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+ vget_high_u16(recip0)));
+ int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+ vget_low_u16(recip1)));
+ int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+ vget_high_u16(recip1)));
+ int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+ vget_low_u16(recip2)));
+ int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+ vget_high_u16(recip2)));
+ int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+ vget_low_u16(recip3)));
+ int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+ vget_high_u16(recip3)));
+ /* Narrow back to 16-bit. */
+ row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+ row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+ row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+ row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+ /* Since VSHR only supports an immediate as its second argument, negate */
+ /* the shift value and shift left. */
+ row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+ vnegq_s16(shift0)));
+ row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+ vnegq_s16(shift1)));
+ row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+ vnegq_s16(shift2)));
+ row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+ vnegq_s16(shift3)));
+
+ /* Restore sign to original product. */
+ row0 = veorq_s16(row0, sign_row0);
+ row0 = vsubq_s16(row0, sign_row0);
+ row1 = veorq_s16(row1, sign_row1);
+ row1 = vsubq_s16(row1, sign_row1);
+ row2 = veorq_s16(row2, sign_row2);
+ row2 = vsubq_s16(row2, sign_row2);
+ row3 = veorq_s16(row3, sign_row3);
+ row3 = vsubq_s16(row3, sign_row3);
+
+ /* Store quantized coefficients to memory. */
+ vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+ vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+ vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+ vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+ }
+}
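
In scalar form, the per-coefficient arithmetic above reduces to the following
sketch (illustrative only, not part of the patch; 'recip', 'corr' and 'shift'
are the per-coefficient entries from the same divisors-table layout):

    /* Scalar sketch (not part of the patch) of one coefficient's divide-free
     * quantization, mirroring the vector steps above. */
    #include <stdint.h>

    static int16_t quantize_one(int16_t coef, uint16_t recip, uint16_t corr,
                                int16_t shift)
    {
      int sign = coef >> 15;                            /* 0 or -1 */
      uint16_t mag = (uint16_t)(coef < 0 ? -coef : coef) + corr;
      uint32_t prod = (uint32_t)mag * recip;            /* widening multiply */
      uint16_t q = (uint16_t)((prod >> 16) >> shift);   /* descale */
      return (int16_t)((q ^ sign) - sign);              /* restore sign */
    }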
diff --git a/simd/arm/jsimd_neon.S b/simd/arm/jsimd_neon.S
deleted file mode 100644
index af929fe..0000000
--- a/simd/arm/jsimd_neon.S
+++ /dev/null
@@ -1,2878 +0,0 @@
-/*
- * ARMv7 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .private_extern _\fname
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
- vtrn.16 \x0, \x1
- vtrn.16 \x2, \x3
- vtrn.32 \x0, \x2
- vtrn.32 \x1, \x3
-.endm
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
- DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
- JLONG q1, q2, q3, q4, q5, q6, q7; \
- JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
- \
- /* 1-D iDCT input data */ \
- row0 = xrow0; \
- row1 = xrow1; \
- row2 = xrow2; \
- row3 = xrow3; \
- row4 = xrow4; \
- row5 = xrow5; \
- row6 = xrow6; \
- row7 = xrow7; \
- \
- q5 = row7 + row3; \
- q4 = row5 + row1; \
- q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
- MULTIPLY(q4, FIX_1_175875602); \
- q7 = MULTIPLY(q5, FIX_1_175875602) + \
- MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
- q2 = MULTIPLY(row2, FIX_0_541196100) + \
- MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
- q4 = q6; \
- q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
- q6 += MULTIPLY(row5, -FIX_2_562915447) + \
- MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
- /* now we can use q1 (reloadable constants have been used up) */ \
- q1 = q3 + q2; \
- q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
- MULTIPLY(row1, -FIX_0_899976223); \
- q5 = q7; \
- q1 = q1 + q6; \
- q7 += MULTIPLY(row7, -FIX_0_899976223) + \
- MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
- \
- /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
- tmp11_plus_tmp2 = q1; \
- row1 = 0; \
- \
- q1 = q1 - q6; \
- q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
- MULTIPLY(row3, -FIX_2_562915447); \
- q1 = q1 - q6; \
- q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
- MULTIPLY(row6, FIX_0_541196100); \
- q3 = q3 - q2; \
- \
- /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
- tmp11_minus_tmp2 = q1; \
- \
- q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
- q2 = q1 + q6; \
- q1 = q1 - q6; \
- \
- /* pick up the results */ \
- tmp0 = q4; \
- tmp1 = q5; \
- tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
- tmp3 = q7; \
- tmp10 = q2; \
- tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
- tmp12 = q3; \
- tmp13 = q1; \
-}
-
-#define XFIX_0_899976223 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_2_562915447 d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865 d1[2]
-#define XFIX_1_175875602 d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
- .short FIX_0_899976223 /* d0[0] */
- .short FIX_0_541196100 /* d0[1] */
- .short FIX_2_562915447 /* d0[2] */
- .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
- .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
- .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
- .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
- .short FIX_1_175875602 /* d1[3] */
- /* reloadable constants */
- .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
- .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
- .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
- .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- ROW0L .req d16
- ROW0R .req d17
- ROW1L .req d18
- ROW1R .req d19
- ROW2L .req d20
- ROW2R .req d21
- ROW3L .req d22
- ROW3R .req d23
- ROW4L .req d24
- ROW4R .req d25
- ROW5L .req d26
- ROW5R .req d27
- ROW6L .req d28
- ROW6R .req d29
- ROW7L .req d30
- ROW7R .req d31
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_islow_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
- add ip, ip, #16
- vmul.s16 q15, q15, q3
- vpush {d8-d15} /* save NEON registers */
- /* 1-D IDCT, pass 1, left 4x8 half */
- vadd.s16 d4, ROW7L, ROW3L
- vadd.s16 d5, ROW5L, ROW1L
- vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d5, XFIX_1_175875602
- vmull.s16 q7, d4, XFIX_1_175875602
- /* Check for the zero coefficients in the right 4x8 half */
- push {r4, r5}
- vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW4L
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
- orr r0, r4, r5
- vmov q4, q6
- vmlsl.s16 q6, ROW5L, XFIX_2_562915447
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- orr r0, r0, r4
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- orr r0, r0, r5
- vadd.s32 q1, q3, q2
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
- vmov q5, q7
- vadd.s32 q1, q1, q6
- orr r0, r0, r4
- vmlsl.s16 q7, ROW7L, XFIX_0_899976223
- orr r0, r0, r5
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1L, q1, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
- orr r0, r0, r4
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- orr r0, r0, r5
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
- vmlal.s16 q6, ROW6L, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- orr r0, r0, r4
- vrshrn.s32 ROW6L, q1, #11
- orr r0, r0, r5
- vadd.s32 q1, q3, q5
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW4L
- orr r0, r0, r4
- vrshrn.s32 ROW2L, q1, #11
- orr r0, r0, r5
- vrshrn.s32 ROW5L, q3, #11
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
- orr r0, r0, r4
- vadd.s32 q2, q5, q6
- orrs r0, r0, r5
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- orr r0, r4, r5
- vsub.s32 q3, q1, q4
- pop {r4, r5}
- vrshrn.s32 ROW7L, q2, #11
- vrshrn.s32 ROW3L, q5, #11
- vrshrn.s32 ROW0L, q6, #11
- vrshrn.s32 ROW4L, q3, #11
-
- beq 3f /* Go to do some special handling for the sparse
- right 4x8 half */
-
- /* 1-D IDCT, pass 1, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vadd.s16 d10, ROW7R, ROW3R
- vadd.s16 d8, ROW5R, ROW1R
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, d8, XFIX_1_175875602
- vtrn.16 ROW2L, ROW3L
- vmull.s16 q7, d10, XFIX_1_175875602
- vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
- vtrn.16 ROW0L, ROW1L
- vsubl.s16 q3, ROW0R, ROW4R
- vmull.s16 q2, ROW2R, XFIX_0_541196100
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vtrn.16 ROW4L, ROW5L
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
- vtrn.32 ROW1L, ROW3L
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1R, XFIX_0_899976223
- vtrn.32 ROW4L, ROW6L
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vtrn.32 ROW0L, ROW2L
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
- vrshrn.s32 ROW1R, q1, #11
- vtrn.32 ROW5L, ROW7L
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW3R, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vrshrn.s32 ROW6R, q1, #11
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0R, ROW4R
- vrshrn.s32 ROW2R, q1, #11
- vrshrn.s32 ROW5R, q3, #11
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vrshrn.s32 ROW7R, q2, #11
- vrshrn.s32 ROW3R, q5, #11
- vrshrn.s32 ROW0R, q6, #11
- vrshrn.s32 ROW4R, q3, #11
- /* Transpose right 4x8 half */
- vtrn.16 ROW6R, ROW7R
- vtrn.16 ROW2R, ROW3R
- vtrn.16 ROW0R, ROW1R
- vtrn.16 ROW4R, ROW5R
- vtrn.32 ROW1R, ROW3R
- vtrn.32 ROW4R, ROW6R
- vtrn.32 ROW0R, ROW2R
- vtrn.32 ROW5R, ROW7R
-
-1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
- vmov q4, q6
- vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2, right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5R, XFIX_1_175875602
- vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
- vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
- vmull.s16 q7, ROW7R, XFIX_1_175875602
- vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
- vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
- vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
- vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
- vmov q4, q6
- vmlsl.s16 q6, ROW5R, XFIX_2_562915447
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
- vshl.s32 q3, q3, #13
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vadd.s32 q1, q1, q6
- vmlsl.s16 q7, ROW7R, XFIX_0_899976223
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
- vmlal.s16 q6, ROW6R, XFIX_0_541196100
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vshl.s32 q5, q5, #13
- vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
-
-2: /* Descale to 8-bit and range limit */
- vqrshrn.s16 d16, q8, #2
- vqrshrn.s16 d17, q9, #2
- vqrshrn.s16 d18, q10, #2
- vqrshrn.s16 d19, q11, #2
- vpop {d8-d15} /* restore NEON registers */
- vqrshrn.s16 d20, q12, #2
- /* Transpose the final 8-bit samples and do signed->unsigned conversion */
- vtrn.16 q8, q9
- vqrshrn.s16 d21, q13, #2
- vqrshrn.s16 d22, q14, #2
- vmov.u8 q0, #(CENTERJSAMPLE)
- vqrshrn.s16 d23, q15, #2
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vtrn.16 q10, q11
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vadd.u8 q10, q10, q0
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vtrn.8 d22, d23
- vst1.8 {d20}, [TMP1]
- vadd.u8 q11, q11, q0
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
-3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
- /* Transpose left 4x8 half */
- vtrn.16 ROW6L, ROW7L
- vtrn.16 ROW2L, ROW3L
- vtrn.16 ROW0L, ROW1L
- vtrn.16 ROW4L, ROW5L
- vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
- vtrn.32 ROW1L, ROW3L
- vtrn.32 ROW4L, ROW6L
- vtrn.32 ROW0L, ROW2L
- vtrn.32 ROW5L, ROW7L
-
- cmp r0, #0
- beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
- pass */
-
- /* Only row 0 is non-zero for the right 4x8 half */
- vdup.s16 ROW1R, ROW0R[1]
- vdup.s16 ROW2R, ROW0R[2]
- vdup.s16 ROW3R, ROW0R[3]
- vdup.s16 ROW4R, ROW0R[0]
- vdup.s16 ROW5R, ROW0R[1]
- vdup.s16 ROW6R, ROW0R[2]
- vdup.s16 ROW7R, ROW0R[3]
- vdup.s16 ROW0R, ROW0R[0]
- b 1b /* Go to 'normal' second pass */
-
-4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW1L, XFIX_1_175875602
- vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW3L, XFIX_1_175875602
- vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW2L, XFIX_0_541196100
- vshll.s16 q3, ROW0L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW1L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW3L, XFIX_2_562915447
- vshrn.s32 ROW1L, q1, #16
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW0L, #13
- vshrn.s32 ROW2L, q1, #16
- vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW3L, q5, #16
- vshrn.s32 ROW0L, q6, #16
- vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
- /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
- vld1.s16 {d2}, [ip, :64] /* reload constants */
- vmull.s16 q6, ROW5L, XFIX_1_175875602
- vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
- vmull.s16 q7, ROW7L, XFIX_1_175875602
- vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
- vmull.s16 q2, ROW6L, XFIX_0_541196100
- vshll.s16 q3, ROW4L, #13
- vmov q4, q6
- vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
- vmlsl.s16 q4, ROW5L, XFIX_0_899976223
- vadd.s32 q1, q3, q2
- vmov q5, q7
- vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
- vadd.s32 q1, q1, q6
- vadd.s32 q6, q6, q6
- vmlsl.s16 q5, ROW7L, XFIX_2_562915447
- vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
- vsub.s32 q1, q1, q6
- vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
- vsub.s32 q3, q3, q2
- vshrn.s32 ROW6R, q1, #16
- vadd.s32 q1, q3, q5
- vsub.s32 q3, q3, q5
- vshll.s16 q5, ROW4L, #13
- vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
- vshrn.s32 ROW5R, q3, #16
- vadd.s32 q2, q5, q6
- vsub.s32 q1, q5, q6
- vadd.s32 q6, q2, q7
- vsub.s32 q2, q2, q7
- vadd.s32 q5, q1, q4
- vsub.s32 q3, q1, q4
- vshrn.s32 ROW7R, q2, #16
- vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
- vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
- vshrn.s32 ROW4R, q3, #16
- b 2b /* Go to epilogue */
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
- .unreq ROW0L
- .unreq ROW0R
- .unreq ROW1L
- .unreq ROW1R
- .unreq ROW2L
- .unreq ROW2R
- .unreq ROW3L
- .unreq ROW3R
- .unreq ROW4L
- .unreq ROW4R
- .unreq ROW5L
- .unreq ROW5R
- .unreq ROW6L
- .unreq ROW6R
- .unreq ROW7L
- .unreq ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, but less accurate integer implementation
- * of the inverse DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_idct_ifast' function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in the ARM NEON case some extra additions are required because the
- * VQDMULH instruction can't handle constants larger than 1. So expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
- */
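-
-/* For reference: VQDMULH computes ((a * b * 2) >> 16) with saturation, so
- * its effective multiplier is a Q15 fraction in [-1, 1). That is why each
- * constant below is stored with its integer part subtracted off. A scalar
- * C sketch of the same decomposition (illustrative only, saturation
- * omitted; not part of this file):
- *
- *   #include <stdint.h>
- *
- *   static int16_t vqdmulh_s16(int16_t a, int16_t b)  // scalar VQDMULH
- *   {
- *     return (int16_t)(((int32_t)a * b * 2) >> 16);
- *   }
- *
- *   static int16_t mul_1_082392200(int16_t x)  // x * 1.082392200
- *   {
- *     // 277 * 128 - 256 * 128 = 2688; 2688 / 32768 ~= 0.082
- *     return x + vqdmulh_s16(x, 277 * 128 - 256 * 128);
- *   }
- */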
-
-#define XFIX_1_082392200 d0[0]
-#define XFIX_1_414213562 d0[1]
-#define XFIX_1_847759065 d0[2]
-#define XFIX_2_613125930 d0[3]
-
-.balign 16
-jsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( q8 )
- * 1 | d18 | d19 ( q9 )
- * 2 | d20 | d21 ( q10 )
- * 3 | d22 | d23 ( q11 )
- * 4 | d24 | d25 ( q12 )
- * 5 | d26 | d27 ( q13 )
- * 6 | d28 | d29 ( q14 )
- * 7 | d30 | d31 ( q15 )
- */
- adr ip, jsimd_idct_ifast_neon_consts
- vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
- vmul.s16 q8, q8, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q9, q9, q1
- vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
- vmul.s16 q10, q10, q2
- vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
- vmul.s16 q11, q11, q3
- vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
- vmul.s16 q12, q12, q0
- vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
- vmul.s16 q14, q14, q2
- vmul.s16 q13, q13, q1
- vld1.16 {d0}, [ip, :64] /* load constants */
- vmul.s16 q15, q15, q3
- vpush {d8-d13} /* save NEON registers */
- /* 1-D IDCT, pass 1 */
- vsub.s16 q2, q10, q14
- vadd.s16 q14, q10, q14
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vadd.s16 q10, q10, q2
- /* Transpose */
- vtrn.16 q8, q9
- vsub.s16 q11, q12, q1
- vtrn.16 q14, q15
- vadd.s16 q12, q12, q1
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q8, q10
- vtrn.32 q13, q15
- vswp d28, d21
- vswp d26, d19
- /* 1-D IDCT, pass 2 */
- vsub.s16 q2, q10, q14
- vswp d30, d23
- vadd.s16 q14, q10, q14
- vswp d24, d17
- vsub.s16 q1, q11, q13
- vadd.s16 q13, q11, q13
- vsub.s16 q5, q9, q15
- vadd.s16 q15, q9, q15
- vqdmulh.s16 q4, q2, XFIX_1_414213562
- vqdmulh.s16 q6, q1, XFIX_2_613125930
- vadd.s16 q3, q1, q1
- vsub.s16 q1, q5, q1
- vadd.s16 q10, q2, q4
- vqdmulh.s16 q4, q1, XFIX_1_847759065
- vsub.s16 q2, q15, q13
- vadd.s16 q3, q3, q6
- vqdmulh.s16 q6, q2, XFIX_1_414213562
- vadd.s16 q1, q1, q4
- vqdmulh.s16 q4, q5, XFIX_1_082392200
- vsub.s16 q10, q10, q14
- vadd.s16 q2, q2, q6
- vsub.s16 q6, q8, q12
- vadd.s16 q12, q8, q12
- vadd.s16 q9, q5, q4
- vadd.s16 q5, q6, q10
- vsub.s16 q10, q6, q10
- vadd.s16 q6, q15, q13
- vadd.s16 q8, q12, q14
- vsub.s16 q3, q6, q3
- vsub.s16 q12, q12, q14
- vsub.s16 q3, q3, q1
- vsub.s16 q1, q9, q1
- vadd.s16 q2, q3, q2
- vsub.s16 q15, q8, q6
- vadd.s16 q1, q1, q2
- vadd.s16 q8, q8, q6
- vadd.s16 q14, q5, q3
- vsub.s16 q9, q5, q3
- vsub.s16 q13, q10, q2
- vpop {d8-d13} /* restore NEON registers */
- vadd.s16 q10, q10, q2
- vsub.s16 q11, q12, q1
- vadd.s16 q12, q12, q1
- /* Descale to 8-bit and range limit */
- vmov.u8 q0, #0x80
- vqshrn.s16 d16, q8, #5
- vqshrn.s16 d17, q9, #5
- vqshrn.s16 d18, q10, #5
- vqshrn.s16 d19, q11, #5
- vqshrn.s16 d20, q12, #5
- vqshrn.s16 d21, q13, #5
- vqshrn.s16 d22, q14, #5
- vqshrn.s16 d23, q15, #5
- vadd.u8 q8, q8, q0
- vadd.u8 q9, q9, q0
- vadd.u8 q10, q10, q0
- vadd.u8 q11, q11, q0
- /* Transpose the final 8-bit samples */
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d16}, [TMP1]
- vst1.8 {d17}, [TMP2]
- ldmia OUTPUT_BUF!, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- vst1.8 {d18}, [TMP1]
- vtrn.8 d20, d21
- vst1.8 {d19}, [TMP2]
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
- vst1.8 {d20}, [TMP1]
- vtrn.8 d22, d23
- vst1.8 {d21}, [TMP2]
- vst1.8 {d22}, [TMP3]
- vst1.8 {d23}, [TMP4]
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling can be achieved by expanding
- * the idct_helper/transpose_4x4 macros and reordering instructions,
- * but readability would suffer somewhat.
- */
-
-#define CONST_BITS 13
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
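-
-/* The values above follow the usual libjpeg convention
- * FIX(x) = (int32_t)((x) * (1 << CONST_BITS) + 0.5), i.e. Q13 fixed point
- * for CONST_BITS = 13. A quick scalar check (illustrative C):
- *
- *   #define FIX(x)  ((int32_t)((x) * (1 << 13) + 0.5))
- *   // FIX(0.211164243) = (int32_t)(1729.857... + 0.5) = 1730
- */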
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* d0[0] */
- .short -FIX_0_765366865 /* d0[1] */
- .short -FIX_0_211164243 /* d0[2] */
- .short FIX_1_451774981 /* d0[3] */
- .short -FIX_2_172734803 /* d1[0] */
- .short FIX_1_061594337 /* d1[1] */
- .short -FIX_0_509795579 /* d1[2] */
- .short -FIX_0_601344887 /* d1[3] */
- .short FIX_0_899976223 /* d2[0] */
- .short FIX_2_562915447 /* d2[1] */
- .short 1 << (CONST_BITS + 1) /* d2[2] */
- .short 0 /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- vmull.s16 q14, \x4, d2[2]
- vmlal.s16 q14, \x8, d0[0]
- vmlal.s16 q14, \x14, d0[1]
-
- vmull.s16 q13, \x16, d1[2]
- vmlal.s16 q13, \x12, d1[3]
- vmlal.s16 q13, \x10, d2[0]
- vmlal.s16 q13, \x6, d2[1]
-
- vmull.s16 q15, \x4, d2[2]
- vmlsl.s16 q15, \x8, d0[0]
- vmlsl.s16 q15, \x14, d0[1]
-
- vmull.s16 q12, \x16, d0[2]
- vmlal.s16 q12, \x12, d0[3]
- vmlal.s16 q12, \x10, d1[0]
- vmlal.s16 q12, \x6, d1[1]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y29, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y29, q14, #\shift
- .endif
-
- vadd.s32 q10, q15, q12
- vsub.s32 q15, q15, q12
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q15, q15, #\shift
- vmovn.s32 \y27, q10
- vmovn.s32 \y28, q15
- .else
- vrshrn.s32 \y27, q10, #\shift
- vrshrn.s32 \y28, q15, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req r1
- TMP3 .req r2
- TMP4 .req ip
-
- vpush {d8-d15}
-
- /* Load constants (d3 is just used for padding) */
- adr TMP4, jsimd_idct_4x4_neon_consts
- vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | d8 | d9
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | d14 | d15
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q3, q3, q10
- vmul.s16 q4, q4, q11
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- vmul.s16 q6, q6, q13
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q7, q7, q14
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
- idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
- transpose_4x4 d5, d7, d9, d11
-
- /* Pass 2 */
- idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
- transpose_4x4 d26, d27, d28, d29
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vadd.s16 q14, q14, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q14
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
- /* We can use far fewer instructions on little-endian systems if the
- * OS kernel is not configured to trap unaligned memory accesses
- */
- vst1.32 {d26[0]}, [TMP1]!
- vst1.32 {d27[0]}, [TMP3]!
- vst1.32 {d26[1]}, [TMP2]!
- vst1.32 {d27[1]}, [TMP4]!
-#else
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[0]}, [TMP3]!
- vst1.8 {d26[1]}, [TMP1]!
- vst1.8 {d27[1]}, [TMP3]!
- vst1.8 {d26[2]}, [TMP1]!
- vst1.8 {d27[2]}, [TMP3]!
- vst1.8 {d26[3]}, [TMP1]!
- vst1.8 {d27[3]}, [TMP3]!
-
- vst1.8 {d26[4]}, [TMP2]!
- vst1.8 {d27[4]}, [TMP4]!
- vst1.8 {d26[5]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP4]!
- vst1.8 {d26[6]}, [TMP2]!
- vst1.8 {d27[6]}, [TMP4]!
- vst1.8 {d26[7]}, [TMP2]!
- vst1.8 {d27[7]}, [TMP4]!
-#endif
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
- * requires far fewer arithmetic operations and hence should be faster.
- * The primary purpose of this particular NEON optimized function is
- * bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* d0[0] */
- .short FIX_0_850430095 /* d0[1] */
- .short -FIX_1_272758580 /* d0[2] */
- .short FIX_3_624509785 /* d0[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- vshll.s16 q14, \x4, #15
- vmull.s16 q13, \x6, d0[3]
- vmlal.s16 q13, \x10, d0[2]
- vmlal.s16 q13, \x12, d0[1]
- vmlal.s16 q13, \x16, d0[0]
-
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
-
- .if \shift > 16
- vrshr.s32 q10, q10, #\shift
- vrshr.s32 q14, q14, #\shift
- vmovn.s32 \y26, q10
- vmovn.s32 \y27, q14
- .else
- vrshrn.s32 \y26, q10, #\shift
- vrshrn.s32 \y27, q14, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req r0
- COEF_BLOCK .req r1
- OUTPUT_BUF .req r2
- OUTPUT_COL .req r3
- TMP1 .req r0
- TMP2 .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP2, jsimd_idct_2x2_neon_consts
- vld1.16 {d0}, [TMP2, :64]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d4 | d5
- * 1 | d6 | d7
- * 2 | - | -
- * 3 | d10 | d11
- * 4 | - | -
- * 5 | d12 | d13
- * 6 | - | -
- * 7 | d16 | d17
- */
- vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
- add COEF_BLOCK, COEF_BLOCK, #16
- vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
- /* Dequantize */
- vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
- vmul.s16 q2, q2, q9
- vmul.s16 q3, q3, q10
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d24, d25}, [DCT_TABLE, :128]!
- vmul.s16 q5, q5, q12
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d26, d27}, [DCT_TABLE, :128]!
- vmul.s16 q6, q6, q13
- add DCT_TABLE, DCT_TABLE, #16
- vld1.16 {d30, d31}, [DCT_TABLE, :128]!
- vmul.s16 q8, q8, q15
-
- /* Pass 1 */
-#if 0
- idct_helper d4, d6, d10, d12, d16, 13, d4, d6
- transpose_4x4 d4, d6, d8, d10
- idct_helper d5, d7, d11, d13, d17, 13, d5, d7
- transpose_4x4 d5, d7, d9, d11
-#else
- vmull.s16 q13, d6, d0[3]
- vmlal.s16 q13, d10, d0[2]
- vmlal.s16 q13, d12, d0[1]
- vmlal.s16 q13, d16, d0[0]
- vmull.s16 q12, d7, d0[3]
- vmlal.s16 q12, d11, d0[2]
- vmlal.s16 q12, d13, d0[1]
- vmlal.s16 q12, d17, d0[0]
- vshll.s16 q14, d4, #15
- vshll.s16 q15, d5, #15
- vadd.s32 q10, q14, q13
- vsub.s32 q14, q14, q13
- vrshrn.s32 d4, q10, #13
- vrshrn.s32 d6, q14, #13
- vadd.s32 q10, q15, q12
- vsub.s32 q14, q15, q12
- vrshrn.s32 d5, q10, #13
- vrshrn.s32 d7, q14, #13
- vtrn.16 q2, q3
- vtrn.32 q3, q5
-#endif
-
- /* Pass 2 */
- idct_helper d4, d6, d10, d7, d11, 20, d26, d27
-
- /* Range limit */
- vmov.u16 q15, #0x80
- vadd.s16 q13, q13, q15
- vqmovun.s16 d26, q13
- vqmovun.s16 d27, q13
-
- /* Store results to the output buffer */
- ldmia OUTPUT_BUF, {TMP1, TMP2}
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- vst1.8 {d26[0]}, [TMP1]!
- vst1.8 {d27[4]}, [TMP1]!
- vst1.8 {d26[1]}, [TMP2]!
- vst1.8 {d27[5]}, [TMP2]!
-
- vpop {d8-d15}
- bx lr
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
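-
-/* The arithmetic below follows the usual JFIF YCbCr->RGB equations with
- * Q14 (red/blue) and Q15 (green) fixed-point coefficients; see the
- * constants emitted into each generated function. A scalar C sketch of one
- * pixel (illustrative; the helper stands in for the saturating narrows):
- *
- *   static uint8_t clamp255(int x)
- *   {
- *     return (uint8_t)(x < 0 ? 0 : x > 255 ? 255 : x);
- *   }
- *
- *   int cb = u - 128, cr = v - 128;
- *   r = clamp255(y + ((22971 * cr + (1 << 13)) >> 14));
- *   g = clamp255(y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15));
- *   b = clamp255(y + ((29033 * cb + (1 << 13)) >> 14));
- */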
-
-
-.macro do_load size
- .if \size == 8
- vld1.8 {d4}, [U, :64]!
- vld1.8 {d5}, [V, :64]!
- vld1.8 {d0}, [Y, :64]!
- pld [U, #64]
- pld [V, #64]
- pld [Y, #64]
- .elseif \size == 4
- vld1.8 {d4[0]}, [U]!
- vld1.8 {d4[1]}, [U]!
- vld1.8 {d4[2]}, [U]!
- vld1.8 {d4[3]}, [U]!
- vld1.8 {d5[0]}, [V]!
- vld1.8 {d5[1]}, [V]!
- vld1.8 {d5[2]}, [V]!
- vld1.8 {d5[3]}, [V]!
- vld1.8 {d0[0]}, [Y]!
- vld1.8 {d0[1]}, [Y]!
- vld1.8 {d0[2]}, [Y]!
- vld1.8 {d0[3]}, [Y]!
- .elseif \size == 2
- vld1.8 {d4[4]}, [U]!
- vld1.8 {d4[5]}, [U]!
- vld1.8 {d5[4]}, [V]!
- vld1.8 {d5[5]}, [V]!
- vld1.8 {d0[4]}, [Y]!
- vld1.8 {d0[5]}, [Y]!
- .elseif \size == 1
- vld1.8 {d4[6]}, [U]!
- vld1.8 {d5[6]}, [V]!
- vld1.8 {d0[6]}, [Y]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size
- .if \bpp == 24
- .if \size == 8
- vst3.8 {d10, d11, d12}, [RGB]!
- .elseif \size == 4
- vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vst4.8 {d10, d11, d12, d13}, [RGB]!
- .elseif \size == 4
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 16
- .if \size == 8
- vst1.16 {q15}, [RGB]!
- .elseif \size == 4
- vst1.16 {d30}, [RGB]!
- .elseif \size == 2
- vst1.16 {d31[0]}, [RGB]!
- vst1.16 {d31[1]}, [RGB]!
- .elseif \size == 1
- vst1.16 {d31[2]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
- vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- vrshrn.s32 d20, q10, #15
- vrshrn.s32 d21, q11, #15
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vrshrn.s32 d28, q14, #14
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q11, q10, d0
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16
- vqmovun.s16 d1\g_offs, q11
- vqmovun.s16 d1\r_offs, q12
- vqmovun.s16 d1\b_offs, q14
- .else /* rgb565 */
- vqshlu.s16 q13, q11, #8
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vsri.u16 q15, q13, #5
- vsri.u16 q15, q14, #11
- .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
- /* "do_yuv_to_rgb_stage2" and "store" */
- vrshrn.s32 d20, q10, #15
- /* "load" and "do_yuv_to_rgb_stage1" */
- pld [U, #64]
- vrshrn.s32 d21, q11, #15
- pld [V, #64]
- vrshrn.s32 d24, q12, #14
- vrshrn.s32 d25, q13, #14
- vld1.8 {d4}, [U, :64]!
- vrshrn.s32 d28, q14, #14
- vld1.8 {d5}, [V, :64]!
- vrshrn.s32 d29, q15, #14
- vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
- vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
- vaddw.u8 q11, q10, d0
- vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
- vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
- vaddw.u8 q12, q12, d0
- vaddw.u8 q14, q14, d0
- .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
- vqmovun.s16 d1\g_offs, q11
- pld [Y, #64]
- vqmovun.s16 d1\r_offs, q12
- vld1.8 {d0}, [Y, :64]!
- vqmovun.s16 d1\b_offs, q14
- vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
- vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
- do_store \bpp, 8
- vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
- vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
- vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
- vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
- .else /**************************** rgb565 ********************************/
- vqshlu.s16 q13, q11, #8
- pld [Y, #64]
- vqshlu.s16 q15, q12, #8
- vqshlu.s16 q14, q14, #8
- vld1.8 {d0}, [Y, :64]!
- vmull.s16 q11, d7, d1[1]
- vmlal.s16 q11, d9, d1[2]
- vsri.u16 q15, q13, #5
- vmull.s16 q12, d8, d1[0]
- vsri.u16 q15, q14, #11
- vmull.s16 q13, d9, d1[0]
- vmull.s16 q14, d6, d1[3]
- do_store \bpp, 8
- vmull.s16 q15, d7, d1[3]
- .endif
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple's gas crashes on adrl; work around that by using adr instead.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- INPUT_ROW .req r2
- OUTPUT_BUF .req r3
- NUM_ROWS .req r4
-
- INPUT_BUF0 .req r5
- INPUT_BUF1 .req r6
- INPUT_BUF2 .req INPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d1, d2, d3 (d0 is just used for padding) */
- adr ip, jsimd_ycc_\colorid\()_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #4]
- ldr INPUT_BUF2, [INPUT_BUF, #8]
- .unreq INPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Initially set d10, d11, d12, d13 to 0xFF */
- vmov.u8 q5, #255
- vmov.u8 q6, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
- ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- blt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load 4
-3:
- tst N, #2
- beq 4f
- do_load 2
-4:
- tst N, #1
- beq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- beq 6f
- do_store \bpp, 4
-6:
- tst N, #2
- beq 7f
- do_store \bpp, 2
-7:
- tst N, #1
- beq 8f
- do_store \bpp, 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
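-
-/* The arithmetic below follows the usual BT.601/libjpeg RGB->YCbCr
- * equations with Q16 fixed-point coefficients; see the constants emitted
- * into each generated function. A scalar C sketch of one pixel
- * (illustrative):
- *
- *   y  = (19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
- *   cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
- *   cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g -  5329 * b) >> 16;
- */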
-
-.macro do_store size
- .if \size == 8
- vst1.8 {d20}, [Y]!
- vst1.8 {d21}, [U]!
- vst1.8 {d22}, [V]!
- .elseif \size == 4
- vst1.8 {d20[0]}, [Y]!
- vst1.8 {d20[1]}, [Y]!
- vst1.8 {d20[2]}, [Y]!
- vst1.8 {d20[3]}, [Y]!
- vst1.8 {d21[0]}, [U]!
- vst1.8 {d21[1]}, [U]!
- vst1.8 {d21[2]}, [U]!
- vst1.8 {d21[3]}, [U]!
- vst1.8 {d22[0]}, [V]!
- vst1.8 {d22[1]}, [V]!
- vst1.8 {d22[2]}, [V]!
- vst1.8 {d22[3]}, [V]!
- .elseif \size == 2
- vst1.8 {d20[4]}, [Y]!
- vst1.8 {d20[5]}, [Y]!
- vst1.8 {d21[4]}, [U]!
- vst1.8 {d21[5]}, [U]!
- vst1.8 {d22[4]}, [V]!
- vst1.8 {d22[5]}, [V]!
- .elseif \size == 1
- vst1.8 {d20[6]}, [Y]!
- vst1.8 {d21[6]}, [U]!
- vst1.8 {d22[6]}, [V]!
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size
- .if \bpp == 24
- .if \size == 8
- vld3.8 {d10, d11, d12}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
- vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
- vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
- vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
- .elseif \size == 2
- vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
- vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
- .elseif \size == 1
- vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- vld4.8 {d10, d11, d12, d13}, [RGB]!
- pld [RGB, #128]
- .elseif \size == 4
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
- .elseif \size == 2
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
- .elseif \size == 1
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vrev64.32 q9, q1
- vrev64.32 q13, q1
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vshrn.u32 d23, q13, #16
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- vmovn.u16 d20, q10 /* d20 = y */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovn.u16 d22, q12 /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
- vrshrn.u32 d20, q7, #16
- vrshrn.u32 d21, q8, #16
- vshrn.u32 d22, q9, #16
- vrev64.32 q9, q1
- vshrn.u32 d23, q13, #16
- vrev64.32 q13, q1
- vshrn.u32 d24, q14, #16
- vshrn.u32 d25, q15, #16
- do_load \bpp, 8
- vmovn.u16 d20, q10 /* d20 = y */
- vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
- vmovn.u16 d21, q11 /* d21 = u */
- vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
- vmovn.u16 d22, q12 /* d22 = v */
- vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
- vmull.u16 q7, d4, d0[0]
- vmlal.u16 q7, d6, d0[1]
- vmlal.u16 q7, d8, d0[2]
- vst1.8 {d20}, [Y]!
- vmull.u16 q8, d5, d0[0]
- vmlal.u16 q8, d7, d0[1]
- vmlal.u16 q8, d9, d0[2]
- vmlsl.u16 q9, d4, d0[3]
- vmlsl.u16 q9, d6, d1[0]
- vmlal.u16 q9, d8, d1[1]
- vst1.8 {d21}, [U]!
- vmlsl.u16 q13, d5, d0[3]
- vmlsl.u16 q13, d7, d1[0]
- vmlal.u16 q13, d9, d1[1]
- vrev64.32 q14, q1
- vrev64.32 q15, q1
- vmlal.u16 q14, d4, d1[1]
- vmlsl.u16 q14, d6, d1[2]
- vmlsl.u16 q14, d8, d1[3]
- vst1.8 {d22}, [V]!
- vmlal.u16 q15, d5, d1[1]
- vmlsl.u16 q15, d7, d1[2]
- vmlsl.u16 q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
- OUTPUT_WIDTH .req r0
- INPUT_BUF .req r1
- OUTPUT_BUF .req r2
- OUTPUT_ROW .req r3
- NUM_ROWS .req r4
-
- OUTPUT_BUF0 .req r5
- OUTPUT_BUF1 .req r6
- OUTPUT_BUF2 .req OUTPUT_BUF
-
- RGB .req r7
- Y .req r8
- U .req r9
- V .req r10
- N .req ip
-
- /* Load constants to d0, d1, d2, d3 */
- adr ip, jsimd_\colorid\()_ycc_neon_consts
- vld1.16 {d0, d1, d2, d3}, [ip, :128]
-
- /* Save ARM registers and handle input arguments */
- push {r4, r5, r6, r7, r8, r9, r10, lr}
- ldr NUM_ROWS, [sp, #(4 * 8)]
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- vpush {d8-d15}
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- blt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #4
-
- /* Inner loop over pixels */
- subs N, N, #8
- blt 3f
- do_load \bpp, 8
- do_rgb_to_yuv_stage1
- subs N, N, #8
- blt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1
- subs N, N, #8
- bge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- beq 8f
-3:
- tst N, #4
- beq 3f
- do_load \bpp, 4
-3:
- tst N, #2
- beq 4f
- do_load \bpp, 2
-4:
- tst N, #1
- beq 5f
- do_load \bpp, 1
-5:
- do_rgb_to_yuv
- tst N, #4
- beq 6f
- do_store 4
-6:
- tst N, #2
- beq 7f
- do_store 2
-7:
- tst N, #1
- beq 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- bgt 0b
-9:
- /* Restore all registers and return */
- vpop {d8-d15}
- pop {r4, r5, r6, r7, r8, r9, r10, pc}
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of VST1.16 instructions
- */
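-
-/* Scalar equivalent of this function (illustrative C):
- *
- *   for (row = 0; row < 8; row++)
- *     for (col = 0; col < 8; col++)
- *       workspace[row * 8 + col] =
- *         (int16_t)sample_data[row][start_col + col] - 128;  // CENTERJSAMPLE
- */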
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req r0
- START_COL .req r1
- WORKSPACE .req r2
- TMP1 .req r3
- TMP2 .req r4
- TMP3 .req r5
- TMP4 .req ip
-
- push {r4, r5}
- vmov.u8 d0, #128
-
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d16}, [TMP1]
- vsubl.u8 q8, d16, d0
- vld1.8 {d18}, [TMP2]
- vsubl.u8 q9, d18, d0
- vld1.8 {d20}, [TMP3]
- vsubl.u8 q10, d20, d0
- vld1.8 {d22}, [TMP4]
- ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
- vsubl.u8 q11, d22, d0
- vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- vld1.8 {d24}, [TMP1]
- vsubl.u8 q12, d24, d0
- vld1.8 {d26}, [TMP2]
- vsubl.u8 q13, d26, d0
- vld1.8 {d28}, [TMP3]
- vsubl.u8 q14, d28, d0
- vld1.8 {d30}, [TMP4]
- vsubl.u8 q15, d30, d0
- vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
- vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
- pop {r4, r5}
- bx lr
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, but less accurate integer implementation
- * of the forward DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_fdct_ifast' function from jfdctfst.c.
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of VLD1.16 instructions
- */
-
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
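-
-/* These use the same coarse Q15 encoding for VQDMULH as the inverse DCT
- * above, e.g. (98 * 128) / 32768 = 0.3828125 ~= 0.382683433. Scalar check
- * (illustrative C):
- *
- *   static int16_t x_0_382683433(int16_t x)  // as VQDMULH computes it
- *   {
- *     return (int16_t)(((int32_t)x * (98 * 128) * 2) >> 16);
- *   }
- */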
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req r0
- TMP .req ip
-
- vpush {d8-d15}
-
- /* Load constants */
- adr TMP, jsimd_fdct_ifast_neon_consts
- vld1.16 {d0}, [TMP, :64]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | q8
- * 1 | d18 | d19 | q9
- * 2 | d20 | d21 | q10
- * 3 | d22 | d23 | q11
- * 4 | d24 | d25 | q12
- * 5 | d26 | d27 | q13
- * 6 | d28 | d29 | q14
- * 7 | d30 | d31 | q15
- */
-
- vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vld1.16 {d28, d29, d30, d31}, [DATA, :128]
- sub DATA, DATA, #(128 - 32)
-
- mov TMP, #2
-1:
- /* Transpose */
- vtrn.16 q12, q13
- vtrn.16 q10, q11
- vtrn.16 q8, q9
- vtrn.16 q14, q15
- vtrn.32 q9, q11
- vtrn.32 q13, q15
- vtrn.32 q8, q10
- vtrn.32 q12, q14
- vswp d30, d23
- vswp d24, d17
- vswp d26, d19
- /* 1-D FDCT */
- vadd.s16 q2, q11, q12
- vswp d28, d21
- vsub.s16 q12, q11, q12
- vsub.s16 q6, q10, q13
- vadd.s16 q10, q10, q13
- vsub.s16 q7, q9, q14
- vadd.s16 q9, q9, q14
- vsub.s16 q1, q8, q15
- vadd.s16 q8, q8, q15
- vsub.s16 q4, q9, q10
- vsub.s16 q5, q8, q2
- vadd.s16 q3, q9, q10
- vadd.s16 q4, q4, q5
- vadd.s16 q2, q8, q2
- vqdmulh.s16 q4, q4, XFIX_0_707106781
- vadd.s16 q11, q12, q6
- vadd.s16 q8, q2, q3
- vsub.s16 q12, q2, q3
- vadd.s16 q3, q6, q7
- vadd.s16 q7, q7, q1
- vqdmulh.s16 q3, q3, XFIX_0_707106781
- vsub.s16 q6, q11, q7
- vadd.s16 q10, q5, q4
- vqdmulh.s16 q6, q6, XFIX_0_382683433
- vsub.s16 q14, q5, q4
- vqdmulh.s16 q11, q11, XFIX_0_541196100
- vqdmulh.s16 q5, q7, XFIX_1_306562965
- vadd.s16 q4, q1, q3
- vsub.s16 q3, q1, q3
- vadd.s16 q7, q7, q6
- vadd.s16 q11, q11, q6
- vadd.s16 q7, q7, q5
- vadd.s16 q13, q3, q11
- vsub.s16 q11, q3, q11
- vadd.s16 q9, q4, q7
- vsub.s16 q15, q4, q7
- subs TMP, TMP, #1
- bne 1b
-
- /* store results */
- vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
- vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vst1.16 {d28, d29, d30, d31}, [DATA, :128]
-
- vpop {d8-d15}
- bx lr
-
- .unreq DATA
- .unreq TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- * Note: the code uses 2-stage pipelining in order to improve instruction
- * scheduling and eliminate stalls (this provides ~15% better
- * performance for this function on both ARM Cortex-A8 and
- * ARM Cortex-A9 when compared to the non-pipelined variant).
- * The instructions which belong to the second stage use different
- * indentation for better readability.
- */
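-
-/* Scalar sketch of the per-coefficient math (illustrative C; recip, corr
- * and shift denote the reciprocal, correction and shift sub-tables of the
- * divisors array):
- *
- *   int16_t  x    = workspace[i];
- *   int16_t  sign = x >> 15;                      // 0 or -1
- *   uint16_t mag  = (uint16_t)(sign ? -x : x);    // |x|
- *   uint16_t t    = (uint16_t)(((uint32_t)(mag + corr[i]) * recip[i]) >> 16)
- *                   >> shift[i];
- *   coef_block[i] = (int16_t)((t ^ sign) - sign); // reapply the sign
- */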
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req r0
- DIVISORS .req r1
- WORKSPACE .req r2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req r3
- SHIFT .req ip
- LOOP_COUNT .req r4
-
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- vabs.s16 q12, q0
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
-
- push {r4, r5}
- mov LOOP_COUNT, #3
-1:
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- veor.u16 q14, q14, q2 /* restore sign */
- vabs.s16 q12, q0
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- veor.u16 q15, q15, q3
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vsub.u16 q14, q14, q2
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vsub.u16 q15, q15, q3
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
- subs LOOP_COUNT, LOOP_COUNT, #1
- bne 1b
- pop {r4, r5}
-
- veor.u16 q14, q14, q2 /* restore sign */
- veor.u16 q15, q15, q3
- vsub.u16 q14, q14, q2
- vsub.u16 q15, q15, q3
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
- bx lr /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
- * JDIMENSION downsampled_width,
- * JSAMPARRAY input_data,
- * JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- * this code, and fixing it could potentially yield a performance
- * improvement of tens of percent on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded to q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
- * Register d28 is used for multiplication by 3. Register q15 is used
- * for adding +1 bias.
- */
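-
-/* This is the "fancy" (triangular) h2v1 filter; the scalar form, matching
- * jdsample.c, is (illustrative C):
- *
- *   out[2 * i]     = (3 * s[i] + s[i - 1] + 1) >> 2;
- *   out[2 * i + 1] = (3 * s[i] + s[i + 1] + 2) >> 2;
- */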
-.macro upsample16 OUTPTR, INPTR
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vmov q1, q0 /* backup source pixels to q1 */
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to the
- * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
- * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
- * instruction is not needed. This unrolling also allows loads and stores to
- * be reordered to hide multiplication latency and reduce stalls.
- */
-.macro upsample32 OUTPTR, INPTR
- /* even 16 pixels group */
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- /* odd 16 pixels group */
- vld1.8 {q1}, [\INPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vmovl.u8 q8, d2
- vext.8 q2, q0, q1, #15
- vmovl.u8 q9, d3
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d2, d28
- vmlal.u8 q11, d3, d28
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
- /* special case for the first and last pixels */
- sub \WIDTH, \WIDTH, #1
- add \OUTPTR, \OUTPTR, #1
- ldrb \TMP1, [\INPTR, \WIDTH]
- strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
- ldrb \TMP1, [\INPTR], #1
- strb \TMP1, [\OUTPTR, #-1]
- vmov.8 d3[7], \TMP1
-
- subs \WIDTH, \WIDTH, #32
- blt 5f
-0: /* process 32 pixels per iteration */
- upsample32 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #32
- bge 0b
-5:
- adds \WIDTH, \WIDTH, #16
- blt 1f
-0: /* process 16 pixels if needed */
- upsample16 \OUTPTR, \INPTR
- subs \WIDTH, \WIDTH, #16
-1:
- adds \WIDTH, \WIDTH, #16
- beq 9f
-
- /* load the remaining 1-15 pixels */
- add \INPTR, \INPTR, \WIDTH
- tst \WIDTH, #1
- beq 2f
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #2
- beq 2f
- vext.8 d0, d0, d0, #6
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #4
- beq 2f
- vrev64.32 d0, d0
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[3]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[2]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[1]}, [\INPTR]
- sub \INPTR, \INPTR, #1
- vld1.8 {d0[0]}, [\INPTR]
-2:
- tst \WIDTH, #8
- beq 2f
- vmov d1, d0
- sub \INPTR, \INPTR, #8
- vld1.8 {d0}, [\INPTR]
-2: /* upsample the remaining pixels */
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vrshrn.u16 d10, q8, #2
- vrshrn.u16 d12, q9, #2
- vshrn.u16 d11, q10, #2
- vshrn.u16 d13, q11, #2
- vzip.8 d10, d11
- vzip.8 d12, d13
- /* store the remaining pixels */
- tst \WIDTH, #8
- beq 2f
- vst1.8 {d10, d11}, [\OUTPTR]!
- vmov q5, q6
-2:
- tst \WIDTH, #4
- beq 2f
- vst1.8 {d10}, [\OUTPTR]!
- vmov d10, d11
-2:
- tst \WIDTH, #2
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
- vst1.8 {d10[2]}, [\OUTPTR]!
- vst1.8 {d10[3]}, [\OUTPTR]!
- vext.8 d10, d10, d10, #4
-2:
- tst \WIDTH, #1
- beq 2f
- vst1.8 {d10[0]}, [\OUTPTR]!
- vst1.8 {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-asm_function jsimd_h2v1_fancy_upsample_neon
-
- MAX_V_SAMP_FACTOR .req r0
- DOWNSAMPLED_WIDTH .req r1
- INPUT_DATA .req r2
- OUTPUT_DATA_PTR .req r3
- OUTPUT_DATA .req OUTPUT_DATA_PTR
-
- OUTPTR .req r4
- INPTR .req r5
- WIDTH .req ip
- TMP .req lr
-
- push {r4, r5, r6, lr}
- vpush {d8-d15}
-
- ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
- cmp MAX_V_SAMP_FACTOR, #0
- ble 99f
-
- /* initialize constants */
- vmov.u8 d28, #3
- vmov.u16 q15, #1
-11:
- ldr INPTR, [INPUT_DATA], #4
- ldr OUTPTR, [OUTPUT_DATA], #4
- mov WIDTH, DOWNSAMPLED_WIDTH
- upsample_row OUTPTR, INPTR, WIDTH, TMP
- subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
- bgt 11b
-
-99:
- vpop {d8-d15}
- pop {r4, r5, r6, pc}
-
- .unreq MAX_V_SAMP_FACTOR
- .unreq DOWNSAMPLED_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA_PTR
- .unreq OUTPUT_DATA
-
- .unreq OUTPTR
- .unreq INPTR
- .unreq WIDTH
- .unreq TMP
-
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET *)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- sub \PUT_BITS, \PUT_BITS, #0x8
- lsr \TMP, \PUT_BUFFER, \PUT_BITS
- uxtb \TMP, \TMP
- strb \TMP, [\BUFFER, #1]!
- cmp \TMP, #0xff
- /*it eq*/
- strbeq \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
- /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
- add \PUT_BITS, \SIZE
- /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
- orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- cmp \PUT_BITS, #0x10
- blt 15f
- eor \ZERO, \ZERO, \ZERO
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
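-
-/* Together these macros maintain a big-endian bit accumulator. A scalar C
- * sketch (illustrative; overflow of the 32-bit accumulator is avoided by
- * the callers, as in the asm):
- *
- *   uint32_t put_buffer;  // bit accumulator
- *   int put_bits;         // number of valid bits in put_buffer
- *
- *   put_buffer = (put_buffer << size) | code;      // put_bits
- *   put_bits  += size;
- *   if (put_bits >= 16) {                          // checkbuf15
- *     for (i = 0; i < 2; i++) {                    // emit_byte, twice
- *       put_bits -= 8;
- *       uint8_t c = (uint8_t)(put_buffer >> put_bits);
- *       *++buffer = c;
- *       if (c == 0xFF)
- *         *++buffer = 0;                           // JPEG byte stuffing
- *     }
- *   }
- */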
-
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
- .byte 0x01
- .byte 0x02
- .byte 0x04
- .byte 0x08
- .byte 0x10
- .byte 0x20
- .byte 0x40
- .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
- push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- add r7, sp, #0x1c
- sub r4, sp, #0x40
- bfc r4, #0, #5
- mov sp, r4 /* align sp on 32 bytes */
- vst1.64 {d8, d9, d10, d11}, [r4, :128]!
- vst1.64 {d12, d13, d14, d15}, [r4, :128]
- sub sp, #0x140 /* reserve 320 bytes */
- str r0, [sp, #0x18] /* working state -> sp + 0x18 */
- add r4, sp, #0x20 /* r4 = t1 */
- ldr lr, [r7, #0x8] /* lr = dctbl */
- sub r10, r1, #0x1 /* r10=buffer-- */
- ldrsh r1, [r2]
- mov r9, #0x10
- mov r8, #0x1
- adr r5, jsimd_huff_encode_one_block_neon_consts
- /* prepare data */
- vld1.8 {d26}, [r5, :64]
- veor q8, q8, q8
- veor q9, q9, q9
- vdup.16 q14, r9
- vdup.16 q15, r8
- veor q10, q10, q10
- veor q11, q11, q11
- sub r1, r1, r3
- add r9, r2, #0x22
- add r8, r2, #0x18
- add r3, r2, #0x36
- vmov.16 d0[0], r1
- vld1.16 {d2[0]}, [r9, :16]
- vld1.16 {d4[0]}, [r8, :16]
- vld1.16 {d6[0]}, [r3, :16]
- add r1, r2, #0x2
- add r9, r2, #0x30
- add r8, r2, #0x26
- add r3, r2, #0x28
- vld1.16 {d0[1]}, [r1, :16]
- vld1.16 {d2[1]}, [r9, :16]
- vld1.16 {d4[1]}, [r8, :16]
- vld1.16 {d6[1]}, [r3, :16]
- add r1, r2, #0x10
- add r9, r2, #0x40
- add r8, r2, #0x34
- add r3, r2, #0x1a
- vld1.16 {d0[2]}, [r1, :16]
- vld1.16 {d2[2]}, [r9, :16]
- vld1.16 {d4[2]}, [r8, :16]
- vld1.16 {d6[2]}, [r3, :16]
- add r1, r2, #0x20
- add r9, r2, #0x32
- add r8, r2, #0x42
- add r3, r2, #0xc
- vld1.16 {d0[3]}, [r1, :16]
- vld1.16 {d2[3]}, [r9, :16]
- vld1.16 {d4[3]}, [r8, :16]
- vld1.16 {d6[3]}, [r3, :16]
- add r1, r2, #0x12
- add r9, r2, #0x24
- add r8, r2, #0x50
- add r3, r2, #0xe
- vld1.16 {d1[0]}, [r1, :16]
- vld1.16 {d3[0]}, [r9, :16]
- vld1.16 {d5[0]}, [r8, :16]
- vld1.16 {d7[0]}, [r3, :16]
- add r1, r2, #0x4
- add r9, r2, #0x16
- add r8, r2, #0x60
- add r3, r2, #0x1c
- vld1.16 {d1[1]}, [r1, :16]
- vld1.16 {d3[1]}, [r9, :16]
- vld1.16 {d5[1]}, [r8, :16]
- vld1.16 {d7[1]}, [r3, :16]
- add r1, r2, #0x6
- add r9, r2, #0x8
- add r8, r2, #0x52
- add r3, r2, #0x2a
- vld1.16 {d1[2]}, [r1, :16]
- vld1.16 {d3[2]}, [r9, :16]
- vld1.16 {d5[2]}, [r8, :16]
- vld1.16 {d7[2]}, [r3, :16]
- add r1, r2, #0x14
- add r9, r2, #0xa
- add r8, r2, #0x44
- add r3, r2, #0x38
- vld1.16 {d1[3]}, [r1, :16]
- vld1.16 {d3[3]}, [r9, :16]
- vld1.16 {d5[3]}, [r8, :16]
- vld1.16 {d7[3]}, [r3, :16]
- vcgt.s16 q8, q8, q0
- vcgt.s16 q9, q9, q1
- vcgt.s16 q10, q10, q2
- vcgt.s16 q11, q11, q3
- vabs.s16 q0, q0
- vabs.s16 q1, q1
- vabs.s16 q2, q2
- vabs.s16 q3, q3
- veor q8, q8, q0
- veor q9, q9, q1
- veor q10, q10, q2
- veor q11, q11, q3
- add r9, r4, #0x20
- add r8, r4, #0x80
- add r3, r4, #0xa0
- vclz.i16 q0, q0
- vclz.i16 q1, q1
- vclz.i16 q2, q2
- vclz.i16 q3, q3
- vsub.i16 q0, q14, q0
- vsub.i16 q1, q14, q1
- vsub.i16 q2, q14, q2
- vsub.i16 q3, q14, q3
- vst1.16 {d0, d1, d2, d3}, [r4, :256]
- vst1.16 {d4, d5, d6, d7}, [r9, :256]
- vshl.s16 q0, q15, q0
- vshl.s16 q1, q15, q1
- vshl.s16 q2, q15, q2
- vshl.s16 q3, q15, q3
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q15
- vsub.i16 q2, q2, q15
- vsub.i16 q3, q3, q15
- vand q8, q8, q0
- vand q9, q9, q1
- vand q10, q10, q2
- vand q11, q11, q3
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- add r1, r2, #0x46
- add r9, r2, #0x3a
- add r8, r2, #0x74
- add r3, r2, #0x6a
- vld1.16 {d8[0]}, [r1, :16]
- vld1.16 {d10[0]}, [r9, :16]
- vld1.16 {d12[0]}, [r8, :16]
- vld1.16 {d14[0]}, [r3, :16]
- veor q8, q8, q8
- veor q9, q9, q9
- veor q10, q10, q10
- veor q11, q11, q11
- add r1, r2, #0x54
- add r9, r2, #0x2c
- add r8, r2, #0x76
- add r3, r2, #0x78
- vld1.16 {d8[1]}, [r1, :16]
- vld1.16 {d10[1]}, [r9, :16]
- vld1.16 {d12[1]}, [r8, :16]
- vld1.16 {d14[1]}, [r3, :16]
- add r1, r2, #0x62
- add r9, r2, #0x1e
- add r8, r2, #0x68
- add r3, r2, #0x7a
- vld1.16 {d8[2]}, [r1, :16]
- vld1.16 {d10[2]}, [r9, :16]
- vld1.16 {d12[2]}, [r8, :16]
- vld1.16 {d14[2]}, [r3, :16]
- add r1, r2, #0x70
- add r9, r2, #0x2e
- add r8, r2, #0x5a
- add r3, r2, #0x6c
- vld1.16 {d8[3]}, [r1, :16]
- vld1.16 {d10[3]}, [r9, :16]
- vld1.16 {d12[3]}, [r8, :16]
- vld1.16 {d14[3]}, [r3, :16]
- add r1, r2, #0x72
- add r9, r2, #0x3c
- add r8, r2, #0x4c
- add r3, r2, #0x5e
- vld1.16 {d9[0]}, [r1, :16]
- vld1.16 {d11[0]}, [r9, :16]
- vld1.16 {d13[0]}, [r8, :16]
- vld1.16 {d15[0]}, [r3, :16]
- add r1, r2, #0x64
- add r9, r2, #0x4a
- add r8, r2, #0x3e
- add r3, r2, #0x6e
- vld1.16 {d9[1]}, [r1, :16]
- vld1.16 {d11[1]}, [r9, :16]
- vld1.16 {d13[1]}, [r8, :16]
- vld1.16 {d15[1]}, [r3, :16]
- add r1, r2, #0x56
- add r9, r2, #0x58
- add r8, r2, #0x4e
- add r3, r2, #0x7c
- vld1.16 {d9[2]}, [r1, :16]
- vld1.16 {d11[2]}, [r9, :16]
- vld1.16 {d13[2]}, [r8, :16]
- vld1.16 {d15[2]}, [r3, :16]
- add r1, r2, #0x48
- add r9, r2, #0x66
- add r8, r2, #0x5c
- add r3, r2, #0x7e
- vld1.16 {d9[3]}, [r1, :16]
- vld1.16 {d11[3]}, [r9, :16]
- vld1.16 {d13[3]}, [r8, :16]
- vld1.16 {d15[3]}, [r3, :16]
- vcgt.s16 q8, q8, q4
- vcgt.s16 q9, q9, q5
- vcgt.s16 q10, q10, q6
- vcgt.s16 q11, q11, q7
- vabs.s16 q4, q4
- vabs.s16 q5, q5
- vabs.s16 q6, q6
- vabs.s16 q7, q7
- veor q8, q8, q4
- veor q9, q9, q5
- veor q10, q10, q6
- veor q11, q11, q7
- add r1, r4, #0x40
- add r9, r4, #0x60
- add r8, r4, #0xc0
- add r3, r4, #0xe0
- vclz.i16 q4, q4
- vclz.i16 q5, q5
- vclz.i16 q6, q6
- vclz.i16 q7, q7
- vsub.i16 q4, q14, q4
- vsub.i16 q5, q14, q5
- vsub.i16 q6, q14, q6
- vsub.i16 q7, q14, q7
- vst1.16 {d8, d9, d10, d11}, [r1, :256]
- vst1.16 {d12, d13, d14, d15}, [r9, :256]
- vshl.s16 q4, q15, q4
- vshl.s16 q5, q15, q5
- vshl.s16 q6, q15, q6
- vshl.s16 q7, q15, q7
- vsub.i16 q4, q4, q15
- vsub.i16 q5, q5, q15
- vsub.i16 q6, q6, q15
- vsub.i16 q7, q7, q15
- vand q8, q8, q4
- vand q9, q9, q5
- vand q10, q10, q6
- vand q11, q11, q7
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- ldr r12, [r7, #0xc] /* r12 = actbl */
- add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
- mov r9, r12 /* r9 = actbl */
- add r6, r4, #0x80 /* r6 = t2 */
- ldr r11, [r0, #0x8] /* r11 = put_buffer */
- ldr r4, [r0, #0xc] /* r4 = put_bits */
- ldrh r2, [r6, #-128] /* r2 = nbits */
- ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */
- ldr r0, [lr, r2, lsl #2]
- ldrb r5, [r1, r2]
- put_bits r11, r4, r0, r5
- checkbuf15 r10, r11, r4, r5, r0
- put_bits r11, r4, r3, r2
- checkbuf15 r10, r11, r4, r5, r0
- mov lr, r6 /* lr = t2 */
- add r5, r9, #0x400 /* r5 = actbl->ehufsi */
- ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
- veor q8, q8, q8
- vceq.i16 q0, q0, q8
- vceq.i16 q1, q1, q8
- vceq.i16 q2, q2, q8
- vceq.i16 q3, q3, q8
- vceq.i16 q4, q4, q8
- vceq.i16 q5, q5, q8
- vceq.i16 q6, q6, q8
- vceq.i16 q7, q7, q8
- vmovn.i16 d0, q0
- vmovn.i16 d2, q1
- vmovn.i16 d4, q2
- vmovn.i16 d6, q3
- vmovn.i16 d8, q4
- vmovn.i16 d10, q5
- vmovn.i16 d12, q6
- vmovn.i16 d14, q7
- vand d0, d0, d26
- vand d2, d2, d26
- vand d4, d4, d26
- vand d6, d6, d26
- vand d8, d8, d26
- vand d10, d10, d26
- vand d12, d12, d26
- vand d14, d14, d26
- vpadd.i8 d0, d0, d2
- vpadd.i8 d4, d4, d6
- vpadd.i8 d8, d8, d10
- vpadd.i8 d12, d12, d14
- vpadd.i8 d0, d0, d4
- vpadd.i8 d8, d8, d12
- vpadd.i8 d0, d0, d8
- vmov.32 r1, d0[1]
- vmov.32 r8, d0[0]
- mvn r1, r1
- mvn r8, r8
- lsrs r1, r1, #0x1
- rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
- rbit r1, r1 /* r1 = index1 */
- rbit r8, r8 /* r8 = index0 */
- ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
- str r1, [sp, #0x14] /* index1 -> sp + 0x14 */
- cmp r8, #0x0
- beq 6f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r12, sp, #0x20 /* r12 = t1 */
- ldr r8, [sp, #0x14] /* r8 = index1 */
- adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
- cmp r8, #0x0
- beq 6f
- clz r2, r8
- sub r12, r12, lr
- lsl r8, r8, r2
- add r2, r2, r12, lsr #1
- add lr, lr, r2, lsl #1
- b 7f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
-7:
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r0, sp, #0x20
- add r0, #0xfe
- cmp lr, r0
- bhs 1f
- ldr r1, [r9]
- ldrb r0, [r5]
- put_bits r11, r4, r1, r0
- checkbuf15 r10, r11, r4, r0, r1
-1:
- ldr r12, [sp, #0x18]
- str r11, [r12, #0x8]
- str r4, [r12, #0xc]
- add r0, r10, #0x1
- add r4, sp, #0x140
- vld1.64 {d8, d9, d10, d11}, [r4, :128]!
- vld1.64 {d12, d13, d14, d15}, [r4, :128]
- sub r4, r7, #0x1c
- mov sp, r4
- pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S
deleted file mode 100644
index d30715a..0000000
--- a/simd/arm64/jsimd_neon.S
+++ /dev/null
@@ -1,3432 +0,0 @@
-/*
- * ARMv8 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
- * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-#if defined(__APPLE__)
-.section __DATA,__const
-#else
-.section .rodata, "a", %progbits
-#endif
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
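-
-/*
- * Editorial note: FIX(x) denotes the fixed-point value round(x * 2^13)
- * (CONST_BITS = 13); e.g. FIX(0.541196100) = round(0.541196100 * 8192)
- * = 4433, as in the table above.
- */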
-
-.balign 16
-Ljsimd_idct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short -F_0_899
- .short F_1_175
- .short F_1_501
- .short -F_1_847
- .short -F_1_961
- .short F_2_053
- .short -F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-
-#define XFIX_1_082392200 v0.h[0]
-#define XFIX_1_414213562 v0.h[1]
-#define XFIX_1_847759065 v0.h[2]
-#define XFIX_2_613125930 v0.h[3]
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
- .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
- .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
- .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
- .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
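-
-/*
- * Editorial note: these constants are Q15 fractions for SQDMULH, which
- * computes (2 * a * b) >> 16, i.e. multiplies by b / 2^15.  Each entry
- * stores only the fractional excess over the nearest integer, e.g.
- * (362 * 128 - 256 * 128) = 13568 ~= (1.414213562 - 1) * 2^15, so
- * x * 1.414213562 is computed as x + SQDMULH(x, XFIX_1_414213562).
- */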
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
- .short FIX_1_847759065 /* v0.h[0] */
- .short -FIX_0_765366865 /* v0.h[1] */
- .short -FIX_0_211164243 /* v0.h[2] */
- .short FIX_1_451774981 /* v0.h[3] */
- .short -FIX_2_172734803 /* v1.h[0] */
- .short FIX_1_061594337 /* v1.h[1] */
- .short -FIX_0_509795579 /* v1.h[2] */
- .short -FIX_0_601344887 /* v1.h[3] */
- .short FIX_0_899976223 /* v2.h[0] */
- .short FIX_2_562915447 /* v2.h[1] */
- .short 1 << (CONST_BITS + 1) /* v2.h[2] */
- .short 0 /* v2.h[3] */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
- .short -FIX_0_720959822 /* v14[0] */
- .short FIX_0_850430095 /* v14[1] */
- .short -FIX_1_272758580 /* v14[2] */
- .short FIX_3_624509785 /* v14[3] */
-
-.balign 16
-Ljsimd_ycc_colorid_neon_consts:
- .short 0, 0, 0, 0
- .short 22971, -11277, -23401, 29033
- .short -128, -128, -128, -128
- .short -128, -128, -128, -128
-
-.balign 16
-Ljsimd_colorid_ycc_neon_consts:
- .short 19595, 38470, 7471, 11059
- .short 21709, 32768, 27439, 5329
- .short 32767, 128, 32767, 128
- .short 32767, 128, 32767, 128
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
- .short F_0_298
- .short -F_0_390
- .short F_0_541
- .short F_0_765
- .short -F_0_899
- .short F_1_175
- .short F_1_501
- .short -F_1_847
- .short -F_1_961
- .short F_2_053
- .short -F_2_562
- .short F_3_072
- .short 0 /* padding */
- .short 0
- .short 0
- .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
- .short (98 * 128) /* XFIX_0_382683433 */
- .short (139 * 128) /* XFIX_0_541196100 */
- .short (181 * 128) /* XFIX_0_707106781 */
- .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
-
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
- 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
- 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
- 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
- .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
- 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
- .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
- 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
- .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
- .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
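-
-/*
- * Editorial note: each 16-byte row above serves as a TBL index vector;
- * the "diff N" row pads a row that falls N samples short of a full
- * 16-byte vector by replicating its last valid column before averaging.
- */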
-
-Ljsimd_huff_encode_one_block_neon_consts:
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
- .byte 0, 1, 2, 3, 16, 17, 32, 33, \
- 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
- .byte 34, 35, 48, 49, 255, 255, 50, 51, \
- 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
- .byte 8, 9, 22, 23, 36, 37, 50, 51, \
- 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
- .byte 54, 55, 40, 41, 26, 27, 12, 13, \
- 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
- .byte 6, 7, 20, 21, 34, 35, 48, 49, \
- 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
- .byte 42, 43, 28, 29, 14, 15, 30, 31, \
- 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
- .byte 255, 255, 255, 255, 56, 57, 42, 43, \
- 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
- .byte 26, 27, 40, 41, 42, 43, 28, 29, \
- 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
- .byte 255, 255, 255, 255, 0, 1, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 line OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
- .byte 4, 5, 6, 7, 255, 255, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
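-/*
- * Editorial note: the rows above appear to be TBL permutation vectors
- * that gather the 8x8 coefficient block into zigzag order; index 255 is
- * out of range for TBL and therefore produces zero, marking lanes that
- * are filled from a different lookup.
- */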
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-
-.text
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .private_extern _\fname
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-.macro get_symbol_loc xi, symbol
-#ifdef __APPLE__
- adrp \xi, \symbol@PAGE
- add \xi, \xi, \symbol@PAGEOFF
-#else
- adrp \xi, \symbol
- add \xi, \xi, :lo12:\symbol
-#endif
-.endm
-
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
- ins \xi\xilen[0], \x0\xilen[0]
- ins \x1\xilen[0], \x0\xilen[1]
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 different registers */
-.macro transpose x0, x1, xi, xilen, literal
- mov \xi\xilen, \x0\xilen
- trn1 \x0\literal, \x0\literal, \x1\literal
- trn2 \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x2\x2len
- trn2 \x2\x2len, \xi\x0len, \x2\x2len
- mov \xi\xilen, \x1\xilen
- trn1 \x1\x1len, \x1\x1len, \x3\x3len
- trn2 \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
- mov \xi\xilen, \x0\xilen
- trn1 \x0\x0len, \x0\x0len, \x1\x1len
- trn2 \x1\x2len, \xi\x0len, \x1\x2len
- mov \xi\xilen, \x2\xilen
- trn1 \x2\x2len, \x2\x2len, \x3\x3len
- trn2 \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
- transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
- transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
-.endm
-
-.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
- trn1 \t0\().8h, \l0\().8h, \l1\().8h
- trn1 \t1\().8h, \l2\().8h, \l3\().8h
- trn1 \t2\().8h, \l4\().8h, \l5\().8h
- trn1 \t3\().8h, \l6\().8h, \l7\().8h
- trn2 \l1\().8h, \l0\().8h, \l1\().8h
- trn2 \l3\().8h, \l2\().8h, \l3\().8h
- trn2 \l5\().8h, \l4\().8h, \l5\().8h
- trn2 \l7\().8h, \l6\().8h, \l7\().8h
-
- trn1 \l4\().4s, \t2\().4s, \t3\().4s
- trn2 \t3\().4s, \t2\().4s, \t3\().4s
- trn1 \t2\().4s, \t0\().4s, \t1\().4s
- trn2 \l2\().4s, \t0\().4s, \t1\().4s
- trn1 \t0\().4s, \l1\().4s, \l3\().4s
- trn2 \l3\().4s, \l1\().4s, \l3\().4s
- trn2 \t1\().4s, \l5\().4s, \l7\().4s
- trn1 \l5\().4s, \l5\().4s, \l7\().4s
-
- trn2 \l6\().2d, \l2\().2d, \t3\().2d
- trn1 \l0\().2d, \t2\().2d, \l4\().2d
- trn1 \l1\().2d, \t0\().2d, \l5\().2d
- trn2 \l7\().2d, \l3\().2d, \t1\().2d
- trn1 \l2\().2d, \l2\().2d, \t3\().2d
- trn2 \l4\().2d, \t2\().2d, \l4\().2d
- trn1 \l3\().2d, \l3\().2d, \t1\().2d
- trn2 \l5\().2d, \t0\().2d, \l5\().2d
-.endm
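-
-/*
- * Editorial note: the transpose macros above are built from TRN1/TRN2.
- * TRN1 interleaves the even-numbered elements of its two sources and
- * TRN2 the odd-numbered ones; on .4h lanes, for example:
- *
- *   trn1 d, a, b  =>  d = { a[0], b[0], a[2], b[2] }
- *   trn2 d, a, b  =>  d = { a[1], b[1], a[3], b[3] }
- *
- * Applying this at 16-, 32- and 64-bit granularity yields a full 8x8
- * transpose of 16-bit elements.
- */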
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- * JSAMPARRAY output_buf, JDIMENSION output_col)
- */
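-
-/*
- * Editorial note: DESCALE(x, n) in the comments below is libjpeg's
- * rounding right shift, (x + (1 << (n - 1))) >> n; the RSHRN (rounding
- * shift right narrow) instructions implement it directly.
- */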
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
-
-asm_function jsimd_idct_islow_neon
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- sub sp, sp, #64
- get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
- mov x10, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
- ld1 {v0.8h, v1.8h}, [x15]
- ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
- ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
- ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
- ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
-
- cmeq v16.8h, v3.8h, #0
- cmeq v26.8h, v4.8h, #0
- cmeq v27.8h, v5.8h, #0
- cmeq v28.8h, v6.8h, #0
- cmeq v29.8h, v7.8h, #0
- cmeq v30.8h, v8.8h, #0
- cmeq v31.8h, v9.8h, #0
-
- and v10.16b, v16.16b, v26.16b
- and v11.16b, v27.16b, v28.16b
- and v12.16b, v29.16b, v30.16b
- and v13.16b, v31.16b, v10.16b
- and v14.16b, v11.16b, v12.16b
- mul v2.8h, v2.8h, v18.8h
- and v15.16b, v13.16b, v14.16b
- shl v10.8h, v2.8h, #(PASS1_BITS)
- sqxtn v16.8b, v15.8h
- mov TMP1, v16.d[0]
- mvn TMP2, TMP1
-
- cbnz TMP2, 2f
- /* Case: all AC coefficients are zero */
- dup v2.2d, v10.d[0]
- dup v6.2d, v10.d[1]
- mov v3.16b, v2.16b
- mov v7.16b, v6.16b
- mov v4.16b, v2.16b
- mov v8.16b, v6.16b
- mov v5.16b, v2.16b
- mov v9.16b, v6.16b
-1:
- /* For this transpose, the data should be organised like this:
- * 00, 01, 02, 03, 40, 41, 42, 43
- * 10, 11, 12, 13, 50, 51, 52, 53
- * 20, 21, 22, 23, 60, 61, 62, 63
- * 30, 31, 32, 33, 70, 71, 72, 73
- * 04, 05, 06, 07, 44, 45, 46, 47
- * 14, 15, 16, 17, 54, 55, 56, 57
- * 24, 25, 26, 27, 64, 65, 66, 67
- * 34, 35, 36, 37, 74, 75, 76, 77
- */
- trn1 v28.8h, v2.8h, v3.8h
- trn1 v29.8h, v4.8h, v5.8h
- trn1 v30.8h, v6.8h, v7.8h
- trn1 v31.8h, v8.8h, v9.8h
- trn2 v16.8h, v2.8h, v3.8h
- trn2 v17.8h, v4.8h, v5.8h
- trn2 v18.8h, v6.8h, v7.8h
- trn2 v19.8h, v8.8h, v9.8h
- trn1 v2.4s, v28.4s, v29.4s
- trn1 v6.4s, v30.4s, v31.4s
- trn1 v3.4s, v16.4s, v17.4s
- trn1 v7.4s, v18.4s, v19.4s
- trn2 v4.4s, v28.4s, v29.4s
- trn2 v8.4s, v30.4s, v31.4s
- trn2 v5.4s, v16.4s, v17.4s
- trn2 v9.4s, v18.4s, v19.4s
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
- shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
- shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
- shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
- shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
- movi v0.16b, #(CENTERJSAMPLE)
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP1, TMP1, OUTPUT_COL
- sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP2, TMP2, OUTPUT_COL
- sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP3, TMP3, OUTPUT_COL
- sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP4, TMP4, OUTPUT_COL
- sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
- blr x30
-
-.balign 16
-2:
- mul v3.8h, v3.8h, v19.8h
- mul v4.8h, v4.8h, v20.8h
- mul v5.8h, v5.8h, v21.8h
- add TMP4, xzr, TMP2, LSL #32
- mul v6.8h, v6.8h, v22.8h
- mul v7.8h, v7.8h, v23.8h
- adds TMP3, xzr, TMP2, LSR #32
- mul v8.8h, v8.8h, v24.8h
- mul v9.8h, v9.8h, v25.8h
- b.ne 3f
- /* Right AC coef is zero */
- dup v15.2d, v10.d[1]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- mov v6.16b, v15.16b
- mov v7.16b, v15.16b
- mov v8.16b, v15.16b
- mov v9.16b, v15.16b
- b 1b
-
-.balign 16
-3:
- cbnz TMP4, 4f
- /* Left AC coef is zero */
- dup v14.2d, v10.d[0]
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- mov v2.16b, v14.16b
- mov v3.16b, v14.16b
- mov v4.16b, v14.16b
- mov v5.16b, v14.16b
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
-.balign 16
-4:
- /* "No" AC coef is zero */
- /* Even part: reverse the even part of the forward DCT. */
- add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
- add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
- sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- mov v21.16b, v19.16b /* tmp3 = z1 */
- mov v20.16b, v18.16b /* tmp3 = z1 */
- smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
- sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
- sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
- sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
- add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
- sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
- add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
- sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
- add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
- sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
- add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
- sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
-
- /* Odd part per figure 8; the matrix is unitary and hence its
- * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
- */
-
- add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
- add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
- add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
-
- smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
- smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
- smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
- smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
- smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
- smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
- smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
- smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
- smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
-
- add v23.4s, v23.4s, v27.4s /* z3 += z5 */
- add v22.4s, v22.4s, v26.4s /* z3 += z5 */
- add v25.4s, v25.4s, v27.4s /* z4 += z5 */
- add v24.4s, v24.4s, v26.4s /* z4 += z5 */
-
- add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
- add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
- add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
- add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
- add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
- add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
- add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
- add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
-
- add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
- add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
- add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
- add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
- add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
- add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
- add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
- add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
-
- /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-
- add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
- add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
- sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
- sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
- add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
- add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
- sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
- sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
- add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
- add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
- sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
- sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
- add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
- add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
- sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
- sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
-
- rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
- rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
- rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
- rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
- rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
- rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
- rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
- b 1b
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-#undef CENTERJSAMPLE
-#undef CONST_BITS
-#undef PASS1_BITS
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, less accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform).  It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_idct_ifast' function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
- * In the ARM NEON case, however, some extra additions are required because
- * the VQDMULH instruction can't handle constants larger than 1.  So
- * expressions like "x * 1.082392200" have to be converted to
- * "x * 0.082392200 + x", which introduces an extra addition.  Overall,
- * there are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH
- * and 35 VADD/VSUB instructions.
- */
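-
-/*
- * Editorial sketch of the trick above, matching the pass-1 code below:
- *
- *   sqdmulh v4.8h, v5.8h, XFIX_1_082392200   (v4  = x * 0.082392200)
- *   add     v17.8h, v5.8h, v4.8h             (v17 = x * 1.082392200)
- *
- * where XFIX_1_082392200 = (277 * 128 - 256 * 128) = 2688, roughly
- * 0.082392200 * 2^15, since SQDMULH multiplies by a Q15 constant.
- */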
-
-asm_function jsimd_idct_ifast_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x9
- TMP4 .req x10
- TMP5 .req x11
- TMP6 .req x12
- TMP7 .req x13
- TMP8 .req x14
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Load and dequantize coefficients into NEON registers
- * with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 ( v16.8h )
- * 1 | d18 | d19 ( v17.8h )
- * 2 | d20 | d21 ( v18.8h )
- * 3 | d22 | d23 ( v19.8h )
- * 4 | d24 | d25 ( v20.8h )
- * 5 | d26 | d27 ( v21.8h )
- * 6 | d28 | d29 ( v22.8h )
- * 7 | d30 | d31 ( v23.8h )
- */
- /* Save NEON registers used in fast IDCT */
- get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts
- ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
- mul v16.8h, v16.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v17.8h, v17.8h, v1.8h
- ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
- mul v18.8h, v18.8h, v2.8h
- ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
- mul v19.8h, v19.8h, v3.8h
- ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
- mul v20.8h, v20.8h, v0.8h
- ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
- mul v22.8h, v22.8h, v2.8h
- mul v21.8h, v21.8h, v1.8h
- ld1 {v0.4h}, [TMP5] /* load constants */
- mul v23.8h, v23.8h, v3.8h
-
- /* 1-D IDCT, pass 1 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
- /* 1-D IDCT, pass 2 */
- sub v2.8h, v18.8h, v22.8h
- add v22.8h, v18.8h, v22.8h
- sub v1.8h, v19.8h, v21.8h
- add v21.8h, v19.8h, v21.8h
- sub v5.8h, v17.8h, v23.8h
- add v23.8h, v17.8h, v23.8h
- sqdmulh v4.8h, v2.8h, XFIX_1_414213562
- sqdmulh v6.8h, v1.8h, XFIX_2_613125930
- add v3.8h, v1.8h, v1.8h
- sub v1.8h, v5.8h, v1.8h
- add v18.8h, v2.8h, v4.8h
- sqdmulh v4.8h, v1.8h, XFIX_1_847759065
- sub v2.8h, v23.8h, v21.8h
- add v3.8h, v3.8h, v6.8h
- sqdmulh v6.8h, v2.8h, XFIX_1_414213562
- add v1.8h, v1.8h, v4.8h
- sqdmulh v4.8h, v5.8h, XFIX_1_082392200
- sub v18.8h, v18.8h, v22.8h
- add v2.8h, v2.8h, v6.8h
- sub v6.8h, v16.8h, v20.8h
- add v20.8h, v16.8h, v20.8h
- add v17.8h, v5.8h, v4.8h
- add v5.8h, v6.8h, v18.8h
- sub v18.8h, v6.8h, v18.8h
- add v6.8h, v23.8h, v21.8h
- add v16.8h, v20.8h, v22.8h
- sub v3.8h, v6.8h, v3.8h
- sub v20.8h, v20.8h, v22.8h
- sub v3.8h, v3.8h, v1.8h
- sub v1.8h, v17.8h, v1.8h
- add v2.8h, v3.8h, v2.8h
- sub v23.8h, v16.8h, v6.8h
- add v1.8h, v1.8h, v2.8h
- add v16.8h, v16.8h, v6.8h
- add v22.8h, v5.8h, v3.8h
- sub v17.8h, v5.8h, v3.8h
- sub v21.8h, v18.8h, v2.8h
- add v18.8h, v18.8h, v2.8h
- sub v19.8h, v20.8h, v1.8h
- add v20.8h, v20.8h, v1.8h
- /* Descale to 8-bit and range limit */
- movi v0.16b, #0x80
- /* Prepare pointers (dual-issue with NEON instructions) */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- sqshrn v28.8b, v16.8h, #5
- ldp TMP3, TMP4, [OUTPUT_BUF], 16
- sqshrn v29.8b, v17.8h, #5
- add TMP1, TMP1, OUTPUT_COL
- sqshrn v30.8b, v18.8h, #5
- add TMP2, TMP2, OUTPUT_COL
- sqshrn v31.8b, v19.8h, #5
- add TMP3, TMP3, OUTPUT_COL
- sqshrn2 v28.16b, v20.8h, #5
- add TMP4, TMP4, OUTPUT_COL
- sqshrn2 v29.16b, v21.8h, #5
- ldp TMP5, TMP6, [OUTPUT_BUF], 16
- sqshrn2 v30.16b, v22.8h, #5
- ldp TMP7, TMP8, [OUTPUT_BUF], 16
- sqshrn2 v31.16b, v23.8h, #5
- add TMP5, TMP5, OUTPUT_COL
- add v16.16b, v28.16b, v0.16b
- add TMP6, TMP6, OUTPUT_COL
- add v18.16b, v29.16b, v0.16b
- add TMP7, TMP7, OUTPUT_COL
- add v20.16b, v30.16b, v0.16b
- add TMP8, TMP8, OUTPUT_COL
- add v22.16b, v31.16b, v0.16b
-
- /* Transpose the final 8-bit samples */
- trn1 v28.16b, v16.16b, v18.16b
- trn1 v30.16b, v20.16b, v22.16b
- trn2 v29.16b, v16.16b, v18.16b
- trn2 v31.16b, v20.16b, v22.16b
-
- trn1 v16.8h, v28.8h, v30.8h
- trn2 v18.8h, v28.8h, v30.8h
- trn1 v20.8h, v29.8h, v31.8h
- trn2 v22.8h, v29.8h, v31.8h
-
- uzp1 v28.4s, v16.4s, v18.4s
- uzp2 v30.4s, v16.4s, v18.4s
- uzp1 v29.4s, v20.4s, v22.4s
- uzp2 v31.4s, v20.4s, v22.4s
-
- /* Store results to the output buffer */
- st1 {v28.d}[0], [TMP1]
- st1 {v29.d}[0], [TMP2]
- st1 {v28.d}[1], [TMP3]
- st1 {v29.d}[1], [TMP4]
- st1 {v30.d}[0], [TMP5]
- st1 {v31.d}[0], [TMP6]
- st1 {v30.d}[1], [TMP7]
- st1 {v31.d}[1], [TMP8]
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 4x4 pixel output from an 8x8 DCT block.  It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
- *       requires far fewer arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON-optimized function is
- *       bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling could be achieved by
- *       expanding the idct_helper/transpose_4x4 macros and reordering
- *       instructions, but readability would suffer somewhat.
- */
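-
-/*
- * Editorial note: the reduced 4x4 IDCT needs only DCT rows 0-3 and 5-7;
- * row 4 never contributes to the output, which is why it is absent from
- * the register allocation table below.
- */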
-
-#define CONST_BITS 13
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
- smull v28.4s, \x4, v2.h[2]
- smlal v28.4s, \x8, v0.h[0]
- smlal v28.4s, \x14, v0.h[1]
-
- smull v26.4s, \x16, v1.h[2]
- smlal v26.4s, \x12, v1.h[3]
- smlal v26.4s, \x10, v2.h[0]
- smlal v26.4s, \x6, v2.h[1]
-
- smull v30.4s, \x4, v2.h[2]
- smlsl v30.4s, \x8, v0.h[0]
- smlsl v30.4s, \x14, v0.h[1]
-
- smull v24.4s, \x16, v0.h[2]
- smlal v24.4s, \x12, v0.h[3]
- smlal v24.4s, \x10, v1.h[0]
- smlal v24.4s, \x6, v1.h[1]
-
- add v20.4s, v28.4s, v26.4s
- sub v28.4s, v28.4s, v26.4s
-
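- /* Editorial note: RSHRN can only encode shift amounts of 1..16 when
- * narrowing to 16-bit lanes, so descales by more than 16 are done as a
- * rounding shift (SRSHR) followed by a separate narrowing (XTN).
- */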
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v28.4s, v28.4s, #\shift
- xtn \y26, v20.4s
- xtn \y29, v28.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y29, v28.4s, #\shift
- .endif
-
- add v20.4s, v30.4s, v24.4s
- sub v30.4s, v30.4s, v24.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v30.4s, v30.4s, #\shift
- xtn \y27, v20.4s
- xtn \y28, v30.4s
- .else
- rshrn \y27, v20.4s, #\shift
- rshrn \y28, v30.4s, #\shift
- .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x1
- TMP3 .req x2
- TMP4 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* Save all used NEON registers */
- sub sp, sp, 64
- mov x9, sp
- /* Load constants (v3.4h is just used for padding) */
- get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | v8.4h | v9.4h
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | v14.4h | v15.4h
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0] /* 128 bit q4 */
- ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0] /* 128 bit q6 */
- mul v8.4h, v8.4h, v22.4h
- mul v9.4h, v9.4h, v23.4h
- ins v8.d[1], v9.d[0] /* 128 bit q8 */
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0] /* 128 bit q10 */
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0] /* 128 bit q12 */
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v14.4h, v14.4h, v28.4h
- mul v15.4h, v15.4h, v29.4h
- ins v14.d[1], v15.d[0] /* 128 bit q14 */
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0] /* 128 bit q16 */
-
- /* Pass 1 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
- v4.4h, v6.4h, v8.4h, v10.4h
- transpose_4x4 v4, v6, v8, v10, v3
- ins v10.d[1], v11.d[0]
- idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
- v5.4h, v7.4h, v9.4h, v11.4h
- transpose_4x4 v5, v7, v9, v11, v3
- ins v10.d[1], v11.d[0]
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
- v26.4h, v27.4h, v28.4h, v29.4h
- transpose_4x4 v26, v27, v28, v29, v3
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- ins v28.d[1], v29.d[0]
- add v26.8h, v26.8h, v30.8h
- add v28.8h, v28.8h, v30.8h
- sqxtun v26.8b, v26.8h
- sqxtun v27.8b, v28.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF], 16
- ldp TMP3, TMP4, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
- add TMP3, TMP3, OUTPUT_COL
- add TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
- /* We can use far fewer instructions on little-endian systems if the
- * OS kernel is not configured to trap unaligned memory accesses.
- */
- st1 {v26.s}[0], [TMP1], 4
- st1 {v27.s}[0], [TMP3], 4
- st1 {v26.s}[1], [TMP2], 4
- st1 {v27.s}[1], [TMP4], 4
-#else
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[0], [TMP3], 1
- st1 {v26.b}[1], [TMP1], 1
- st1 {v27.b}[1], [TMP3], 1
- st1 {v26.b}[2], [TMP1], 1
- st1 {v27.b}[2], [TMP3], 1
- st1 {v26.b}[3], [TMP1], 1
- st1 {v27.b}[3], [TMP3], 1
-
- st1 {v26.b}[4], [TMP2], 1
- st1 {v27.b}[4], [TMP4], 1
- st1 {v26.b}[5], [TMP2], 1
- st1 {v27.b}[5], [TMP4], 1
- st1 {v26.b}[6], [TMP2], 1
- st1 {v27.b}[6], [TMP4], 1
- st1 {v26.b}[7], [TMP2], 1
- st1 {v27.b}[7], [TMP4], 1
-#endif
-
- /* vpop {v8.4h - v15.4h} ; not available */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for producing reduced-size
- * 2x2 pixel output from an 8x8 DCT block.  It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
- *       requires far fewer arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON-optimized function is
- *       bit-exact compatibility with jpeg-6b.
- */
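-
-/*
- * Editorial note: per jpeg-6b's jidctred.c, only DCT rows 0, 1, 3, 5,
- * and 7 contribute to the 2x2 output, which is why rows 2, 4, and 6 are
- * absent from the register allocation table below.
- */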
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
- sshll v15.4s, \x4, #15
- smull v26.4s, \x6, v14.h[3]
- smlal v26.4s, \x10, v14.h[2]
- smlal v26.4s, \x12, v14.h[1]
- smlal v26.4s, \x16, v14.h[0]
-
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
-
- .if \shift > 16
- srshr v20.4s, v20.4s, #\shift
- srshr v15.4s, v15.4s, #\shift
- xtn \y26, v20.4s
- xtn \y27, v15.4s
- .else
- rshrn \y26, v20.4s, #\shift
- rshrn \y27, v15.4s, #\shift
- .endif
-.endm
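-
-/*
- * For reference, a scalar sketch of the idct_helper macro above. This is
- * illustrative only: c1..c4 stand for the four jpeg-6b 2x2 constants loaded
- * into v14.h[0..3] from Ljsimd_idct_2x2_neon_consts (defined outside this
- * hunk), and the inputs are the five nonzero rows/columns 0, 1, 3, 5, 7.
- *
- *   static inline void idct_2x2_helper(int16_t x0, int16_t x1, int16_t x3,
- *                                      int16_t x5, int16_t x7, int shift,
- *                                      int16_t c1, int16_t c2, int16_t c3,
- *                                      int16_t c4, int16_t *o0, int16_t *o1)
- *   {
- *     int32_t even = (int32_t)x0 << 15;       -- sshll v15.4s, \x4, #15
- *     int32_t odd  = x1 * c4 + x3 * c3        -- smull/smlal against v14
- *                  + x5 * c2 + x7 * c1;
- *     int32_t r    = 1 << (shift - 1);        -- rounding term, as in rshrn
- *     *o0 = (int16_t)((even + odd + r) >> shift);
- *     *o1 = (int16_t)((even - odd + r) >> shift);
- *   }
- */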
-
-asm_function jsimd_idct_2x2_neon
-
- DCT_TABLE .req x0
- COEF_BLOCK .req x1
- OUTPUT_BUF .req x2
- OUTPUT_COL .req x3
- TMP1 .req x0
- TMP2 .req x15
-
- /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x3 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x3, w3
-
- /* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants */
- get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v14.4h}, [TMP2]
-
- /* Load all COEF_BLOCK into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | v4.4h | v5.4h
- * 1 | v6.4h | v7.4h
- * 2 | - | -
- * 3 | v10.4h | v11.4h
- * 4 | - | -
- * 5 | v12.4h | v13.4h
- * 6 | - | -
- * 7 | v16.4h | v17.4h
- */
- ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
- add COEF_BLOCK, COEF_BLOCK, #16
- ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
- /* Dequantize */
- ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
- mul v4.4h, v4.4h, v18.4h
- mul v5.4h, v5.4h, v19.4h
- ins v4.d[1], v5.d[0]
- mul v6.4h, v6.4h, v20.4h
- mul v7.4h, v7.4h, v21.4h
- ins v6.d[1], v7.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
- mul v10.4h, v10.4h, v24.4h
- mul v11.4h, v11.4h, v25.4h
- ins v10.d[1], v11.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
- mul v12.4h, v12.4h, v26.4h
- mul v13.4h, v13.4h, v27.4h
- ins v12.d[1], v13.d[0]
- add DCT_TABLE, DCT_TABLE, #16
- ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
- mul v16.4h, v16.4h, v30.4h
- mul v17.4h, v17.4h, v31.4h
- ins v16.d[1], v17.d[0]
-
- /* Pass 1 */
-#if 0
- idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
- transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
- idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
- transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
-#else
- smull v26.4s, v6.4h, v14.h[3]
- smlal v26.4s, v10.4h, v14.h[2]
- smlal v26.4s, v12.4h, v14.h[1]
- smlal v26.4s, v16.4h, v14.h[0]
- smull v24.4s, v7.4h, v14.h[3]
- smlal v24.4s, v11.4h, v14.h[2]
- smlal v24.4s, v13.4h, v14.h[1]
- smlal v24.4s, v17.4h, v14.h[0]
- sshll v15.4s, v4.4h, #15
- sshll v30.4s, v5.4h, #15
- add v20.4s, v15.4s, v26.4s
- sub v15.4s, v15.4s, v26.4s
- rshrn v4.4h, v20.4s, #13
- rshrn v6.4h, v15.4s, #13
- add v20.4s, v30.4s, v24.4s
- sub v15.4s, v30.4s, v24.4s
- rshrn v5.4h, v20.4s, #13
- rshrn v7.4h, v15.4s, #13
- ins v4.d[1], v5.d[0]
- ins v6.d[1], v7.d[0]
- transpose v4, v6, v3, .16b, .8h
- transpose v6, v10, v3, .16b, .4s
- ins v11.d[0], v10.d[1]
- ins v7.d[0], v6.d[1]
-#endif
-
- /* Pass 2 */
- idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
- /* Range limit */
- movi v30.8h, #0x80
- ins v26.d[1], v27.d[0]
- add v26.8h, v26.8h, v30.8h
- sqxtun v30.8b, v26.8h
- ins v26.d[0], v30.d[0]
- sqxtun v27.8b, v26.8h
-
- /* Store results to the output buffer */
- ldp TMP1, TMP2, [OUTPUT_BUF]
- add TMP1, TMP1, OUTPUT_COL
- add TMP2, TMP2, OUTPUT_COL
-
- st1 {v26.b}[0], [TMP1], 1
- st1 {v27.b}[4], [TMP1], 1
- st1 {v26.b}[1], [TMP2], 1
- st1 {v27.b}[5], [TMP2], 1
-
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- blr x30
-
- .unreq DCT_TABLE
- .unreq COEF_BLOCK
- .unreq OUTPUT_BUF
- .unreq OUTPUT_COL
- .unreq TMP1
- .unreq TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
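-
-/*
- * A scalar sketch of the per-pixel math, assuming the fixed-point constants
- * named in the stage comments below (22971 ~ 1.40200 * 2^14, -11277 ~
- * -0.34414 * 2^15, -23401 ~ -0.71414 * 2^15, 29033 ~ 1.77200 * 2^14);
- * clamp() is saturation to [0, 255], which sqxtun provides for free:
- *
- *   static inline void ycc_to_rgb(int y, int cb, int cr,
- *                                 uint8_t *r, uint8_t *g, uint8_t *b)
- *   {
- *     int u = cb - 128, v = cr - 128;
- *     *r = clamp(y + ((22971 * v + 8192) >> 14));
- *     *g = clamp(y + ((-11277 * u - 23401 * v + 16384) >> 15));
- *     *b = clamp(y + ((29033 * u + 8192) >> 14));
- *   }
- */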
-
-.macro do_load size
- .if \size == 8
- ld1 {v4.8b}, [U], 8
- ld1 {v5.8b}, [V], 8
- ld1 {v0.8b}, [Y], 8
- prfm pldl1keep, [U, #64]
- prfm pldl1keep, [V, #64]
- prfm pldl1keep, [Y, #64]
- .elseif \size == 4
- ld1 {v4.b}[0], [U], 1
- ld1 {v4.b}[1], [U], 1
- ld1 {v4.b}[2], [U], 1
- ld1 {v4.b}[3], [U], 1
- ld1 {v5.b}[0], [V], 1
- ld1 {v5.b}[1], [V], 1
- ld1 {v5.b}[2], [V], 1
- ld1 {v5.b}[3], [V], 1
- ld1 {v0.b}[0], [Y], 1
- ld1 {v0.b}[1], [Y], 1
- ld1 {v0.b}[2], [Y], 1
- ld1 {v0.b}[3], [Y], 1
- .elseif \size == 2
- ld1 {v4.b}[4], [U], 1
- ld1 {v4.b}[5], [U], 1
- ld1 {v5.b}[4], [V], 1
- ld1 {v5.b}[5], [V], 1
- ld1 {v0.b}[4], [Y], 1
- ld1 {v0.b}[5], [Y], 1
- .elseif \size == 1
- ld1 {v4.b}[6], [U], 1
- ld1 {v5.b}[6], [V], 1
- ld1 {v0.b}[6], [Y], 1
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_store bpp, size, fast_st3
- .if \bpp == 24
- .if \size == 8
- .if \fast_st3 == 1
- st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
- .else
- st1 {v10.b}[0], [RGB], #1
- st1 {v11.b}[0], [RGB], #1
- st1 {v12.b}[0], [RGB], #1
-
- st1 {v10.b}[1], [RGB], #1
- st1 {v11.b}[1], [RGB], #1
- st1 {v12.b}[1], [RGB], #1
-
- st1 {v10.b}[2], [RGB], #1
- st1 {v11.b}[2], [RGB], #1
- st1 {v12.b}[2], [RGB], #1
-
- st1 {v10.b}[3], [RGB], #1
- st1 {v11.b}[3], [RGB], #1
- st1 {v12.b}[3], [RGB], #1
-
- st1 {v10.b}[4], [RGB], #1
- st1 {v11.b}[4], [RGB], #1
- st1 {v12.b}[4], [RGB], #1
-
- st1 {v10.b}[5], [RGB], #1
- st1 {v11.b}[5], [RGB], #1
- st1 {v12.b}[5], [RGB], #1
-
- st1 {v10.b}[6], [RGB], #1
- st1 {v11.b}[6], [RGB], #1
- st1 {v12.b}[6], [RGB], #1
-
- st1 {v10.b}[7], [RGB], #1
- st1 {v11.b}[7], [RGB], #1
- st1 {v12.b}[7], [RGB], #1
- .endif
- .elseif \size == 4
- st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
- st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
- st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
- st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
- .elseif \size == 2
- st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
- st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
- .elseif \size == 1
- st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
- .elseif \size == 4
- st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
- st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
- st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
- st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
- .elseif \size == 2
- st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
- st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
- .elseif \size == 1
- st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 16
- .if \size == 8
- st1 {v25.8h}, [RGB], 16
- .elseif \size == 4
- st1 {v25.4h}, [RGB], 8
- .elseif \size == 2
- st1 {v25.h}[4], [RGB], 2
- st1 {v25.h}[5], [RGB], 2
- .elseif \size == 1
- st1 {v25.h}[6], [RGB], 2
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
- g_offs, gsize, b_offs, bsize, \
- defsize, fast_st3
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
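-
-/*
- * The loop shape this implements, as illustrative pseudocode (the stage
- * names are placeholders for the macros defined below, not real functions):
- *
- *   do_yuv_to_rgb_stage1();                  -- start multiplies, pixels 0..7
- *   while (at least 8 more pixels remain) {
- *     do_yuv_to_rgb_stage2_store_load_stage1();
- *   }                                        -- finish/store one batch while
- *   do_yuv_to_rgb_stage2();                  -- the next batch's loads and
- *   do_store();                              -- multiplies are in flight
- *
- * Overlapping the two stages this way hides load and multiply latency,
- * which matters most on in-order cores.
- */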
-
-.macro do_yuv_to_rgb_stage1
- uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
- uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
- smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
- smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
- smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
- rshrn v20.4h, v20.4s, #15
- rshrn2 v20.8h, v22.4s, #15
- rshrn v24.4h, v24.4s, #14
- rshrn2 v24.8h, v26.4s, #14
- rshrn v28.4h, v28.4s, #14
- rshrn2 v28.8h, v30.4s, #14
- uaddw v20.8h, v20.8h, v0.8b
- uaddw v24.8h, v24.8h, v0.8b
- uaddw v28.8h, v28.8h, v0.8b
- .if \bpp != 16
- sqxtun v1\g_offs\defsize, v20.8h
- sqxtun v1\r_offs\defsize, v24.8h
- sqxtun v1\b_offs\defsize, v28.8h
- .else
- sqshlu v21.8h, v20.8h, #8
- sqshlu v25.8h, v24.8h, #8
- sqshlu v29.8h, v28.8h, #8
- sri v25.8h, v21.8h, #5
- sri v25.8h, v29.8h, #11
- .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
- rshrn v20.4h, v20.4s, #15
- rshrn v24.4h, v24.4s, #14
- rshrn v28.4h, v28.4s, #14
- ld1 {v4.8b}, [U], 8
- rshrn2 v20.8h, v22.4s, #15
- rshrn2 v24.8h, v26.4s, #14
- rshrn2 v28.8h, v30.4s, #14
- ld1 {v5.8b}, [V], 8
- uaddw v20.8h, v20.8h, v0.8b
- uaddw v24.8h, v24.8h, v0.8b
- uaddw v28.8h, v28.8h, v0.8b
- .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
- sqxtun v1\g_offs\defsize, v20.8h
- ld1 {v0.8b}, [Y], 8
- sqxtun v1\r_offs\defsize, v24.8h
- prfm pldl1keep, [U, #64]
- prfm pldl1keep, [V, #64]
- prfm pldl1keep, [Y, #64]
- sqxtun v1\b_offs\defsize, v28.8h
- uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
- uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
- smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
- smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
- .else /**************************** rgb565 ********************************/
- sqshlu v21.8h, v20.8h, #8
- sqshlu v25.8h, v24.8h, #8
- sqshlu v29.8h, v28.8h, #8
- uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
- uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
- ld1 {v0.8b}, [Y], 8
- smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
- smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
- smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
- smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
- sri v25.8h, v21.8h, #5
- smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
- smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
- prfm pldl1keep, [U, #64]
- prfm pldl1keep, [V, #64]
- prfm pldl1keep, [Y, #64]
- sri v25.8h, v29.8h, #11
- .endif
- do_store \bpp, 8, \fast_st3
- smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
- smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb
- do_yuv_to_rgb_stage1
- do_yuv_to_rgb_stage2
-.endm
-
-/* Apple's gas crashes on adrl; work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.if \fast_st3 == 1
-asm_function jsimd_ycc_\colorid\()_convert_neon
-.else
-asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
-.endif
- OUTPUT_WIDTH .req w0
- INPUT_BUF .req x1
- INPUT_ROW .req w2
- OUTPUT_BUF .req x3
- NUM_ROWS .req w4
-
- INPUT_BUF0 .req x5
- INPUT_BUF1 .req x6
- INPUT_BUF2 .req x1
-
- RGB .req x7
- Y .req x9
- U .req x10
- V .req x11
- N .req w15
-
- sub sp, sp, 64
- mov x9, sp
-
- /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
- get_symbol_loc x15, Ljsimd_ycc_colorid_neon_consts
-
- /* Save NEON registers */
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
- ld1 {v0.4h, v1.4h}, [x15], 16
- ld1 {v2.8h}, [x15]
-
- ldr INPUT_BUF0, [INPUT_BUF]
- ldr INPUT_BUF1, [INPUT_BUF, #8]
- ldr INPUT_BUF2, [INPUT_BUF, #16]
- .unreq INPUT_BUF
-
- /* Initially set the alpha/filler registers v10 and v13 to 0xFF */
- movi v10.16b, #255
- movi v13.16b, #255
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- b.lt 9f
-0:
- ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
- ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
- mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
- add INPUT_ROW, INPUT_ROW, #1
- ldr RGB, [OUTPUT_BUF], #8
-
- /* Inner loop over pixels */
- subs N, N, #8
- b.lt 3f
- do_load 8
- do_yuv_to_rgb_stage1
- subs N, N, #8
- b.lt 2f
-1:
- do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
- subs N, N, #8
- b.ge 1b
-2:
- do_yuv_to_rgb_stage2
- do_store \bpp, 8, \fast_st3
- tst N, #7
- b.eq 8f
-3:
- tst N, #4
- b.eq 3f
- do_load 4
-3:
- tst N, #2
- b.eq 4f
- do_load 2
-4:
- tst N, #1
- b.eq 5f
- do_load 1
-5:
- do_yuv_to_rgb
- tst N, #4
- b.eq 6f
- do_store \bpp, 4, \fast_st3
-6:
- tst N, #2
- b.eq 7f
- do_store \bpp, 2, \fast_st3
-7:
- tst N, #1
- b.eq 8f
- do_store \bpp, 1, \fast_st3
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- b.gt 0b
-9:
- /* Restore all registers and return */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- br x30
- .unreq OUTPUT_WIDTH
- .unreq INPUT_ROW
- .unreq OUTPUT_BUF
- .unreq NUM_ROWS
- .unreq INPUT_BUF0
- .unreq INPUT_BUF1
- .unreq INPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
-generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
-
-generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
-generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
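-
-/*
- * Scalar reference for the per-pixel math, using the jccolor.c constants
- * (FIX(x) = x * 65536, rounded). The NEON version folds the +128 chroma
- * offset and the rounding terms into the preloaded constants, so treat this
- * as a sketch of the arithmetic rather than a bit-exact restatement:
- *
- *   static inline void rgb_to_ycc(int r, int g, int b,
- *                                 uint8_t *y, uint8_t *cb, uint8_t *cr)
- *   {
- *     *y  = (uint8_t)((19595 * r + 38470 * g +  7471 * b + 32768) >> 16);
- *     *cb = (uint8_t)(((-11059 * r - 21709 * g + 32768 * b) >> 16) + 128);
- *     *cr = (uint8_t)(((32768 * r - 27439 * g -  5329 * b) >> 16) + 128);
- *   }
- */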
-
-.macro do_store size
- .if \size == 8
- st1 {v20.8b}, [Y], #8
- st1 {v21.8b}, [U], #8
- st1 {v22.8b}, [V], #8
- .elseif \size == 4
- st1 {v20.b}[0], [Y], #1
- st1 {v20.b}[1], [Y], #1
- st1 {v20.b}[2], [Y], #1
- st1 {v20.b}[3], [Y], #1
- st1 {v21.b}[0], [U], #1
- st1 {v21.b}[1], [U], #1
- st1 {v21.b}[2], [U], #1
- st1 {v21.b}[3], [U], #1
- st1 {v22.b}[0], [V], #1
- st1 {v22.b}[1], [V], #1
- st1 {v22.b}[2], [V], #1
- st1 {v22.b}[3], [V], #1
- .elseif \size == 2
- st1 {v20.b}[4], [Y], #1
- st1 {v20.b}[5], [Y], #1
- st1 {v21.b}[4], [U], #1
- st1 {v21.b}[5], [U], #1
- st1 {v22.b}[4], [V], #1
- st1 {v22.b}[5], [V], #1
- .elseif \size == 1
- st1 {v20.b}[6], [Y], #1
- st1 {v21.b}[6], [U], #1
- st1 {v22.b}[6], [V], #1
- .else
- .error unsupported macroblock size
- .endif
-.endm
-
-.macro do_load bpp, size, fast_ld3
- .if \bpp == 24
- .if \size == 8
- .if \fast_ld3 == 1
- ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
- .else
- ld1 {v10.b}[0], [RGB], #1
- ld1 {v11.b}[0], [RGB], #1
- ld1 {v12.b}[0], [RGB], #1
-
- ld1 {v10.b}[1], [RGB], #1
- ld1 {v11.b}[1], [RGB], #1
- ld1 {v12.b}[1], [RGB], #1
-
- ld1 {v10.b}[2], [RGB], #1
- ld1 {v11.b}[2], [RGB], #1
- ld1 {v12.b}[2], [RGB], #1
-
- ld1 {v10.b}[3], [RGB], #1
- ld1 {v11.b}[3], [RGB], #1
- ld1 {v12.b}[3], [RGB], #1
-
- ld1 {v10.b}[4], [RGB], #1
- ld1 {v11.b}[4], [RGB], #1
- ld1 {v12.b}[4], [RGB], #1
-
- ld1 {v10.b}[5], [RGB], #1
- ld1 {v11.b}[5], [RGB], #1
- ld1 {v12.b}[5], [RGB], #1
-
- ld1 {v10.b}[6], [RGB], #1
- ld1 {v11.b}[6], [RGB], #1
- ld1 {v12.b}[6], [RGB], #1
-
- ld1 {v10.b}[7], [RGB], #1
- ld1 {v11.b}[7], [RGB], #1
- ld1 {v12.b}[7], [RGB], #1
- .endif
- prfm pldl1keep, [RGB, #128]
- .elseif \size == 4
- ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
- .elseif \size == 2
- ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
- ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
- .elseif \size == 1
- ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
- .else
- .error unsupported macroblock size
- .endif
- .elseif \bpp == 32
- .if \size == 8
- ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
- prfm pldl1keep, [RGB, #128]
- .elseif \size == 4
- ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
- .elseif \size == 2
- ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
- ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
- .elseif \size == 1
- ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
- .else
- .error unsupported macroblock size
- .endif
- .else
- .error unsupported bpp
- .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
- b_offs, fast_ld3
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
- ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */
- ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */
- ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */
- rev64 v18.4s, v1.4s
- rev64 v26.4s, v1.4s
- rev64 v28.4s, v1.4s
- rev64 v30.4s, v1.4s
- umull v14.4s, v4.4h, v0.h[0]
- umull2 v16.4s, v4.8h, v0.h[0]
- umlsl v18.4s, v4.4h, v0.h[3]
- umlsl2 v26.4s, v4.8h, v0.h[3]
- umlal v28.4s, v4.4h, v0.h[5]
- umlal2 v30.4s, v4.8h, v0.h[5]
- umlal v14.4s, v6.4h, v0.h[1]
- umlal2 v16.4s, v6.8h, v0.h[1]
- umlsl v18.4s, v6.4h, v0.h[4]
- umlsl2 v26.4s, v6.8h, v0.h[4]
- umlsl v28.4s, v6.4h, v0.h[6]
- umlsl2 v30.4s, v6.8h, v0.h[6]
- umlal v14.4s, v8.4h, v0.h[2]
- umlal2 v16.4s, v8.8h, v0.h[2]
- umlal v18.4s, v8.4h, v0.h[5]
- umlal2 v26.4s, v8.8h, v0.h[5]
- umlsl v28.4s, v8.4h, v0.h[7]
- umlsl2 v30.4s, v8.8h, v0.h[7]
-.endm
-
-.macro do_rgb_to_yuv_stage2
- rshrn v20.4h, v14.4s, #16
- shrn v22.4h, v18.4s, #16
- shrn v24.4h, v28.4s, #16
- rshrn2 v20.8h, v16.4s, #16
- shrn2 v22.8h, v26.4s, #16
- shrn2 v24.8h, v30.4s, #16
- xtn v20.8b, v20.8h /* v20 = y */
- xtn v21.8b, v22.8h /* v21 = u */
- xtn v22.8b, v24.8h /* v22 = v */
-.endm
-
-.macro do_rgb_to_yuv
- do_rgb_to_yuv_stage1
- do_rgb_to_yuv_stage2
-.endm
-
-/* TODO: expand macros and interleave instructions if some in-order
- * ARM64 processor can actually dual-issue LOAD/STORE with ALU operations */
-.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
- do_rgb_to_yuv_stage2
- do_load \bpp, 8, \fast_ld3
- st1 {v20.8b}, [Y], #8
- st1 {v21.8b}, [U], #8
- st1 {v22.8b}, [V], #8
- do_rgb_to_yuv_stage1
-.endm
-
-
-.if \fast_ld3 == 1
-asm_function jsimd_\colorid\()_ycc_convert_neon
-.else
-asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
-.endif
- OUTPUT_WIDTH .req w0
- INPUT_BUF .req x1
- OUTPUT_BUF .req x2
- OUTPUT_ROW .req w3
- NUM_ROWS .req w4
-
- OUTPUT_BUF0 .req x5
- OUTPUT_BUF1 .req x6
- OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */
-
- RGB .req x7
- Y .req x9
- U .req x10
- V .req x11
- N .req w12
-
- /* Load constants into v0.8h and v1.8h */
- get_symbol_loc x13, Ljsimd_colorid_ycc_neon_consts
-
- ld1 {v0.8h, v1.8h}, [x13]
-
- ldr OUTPUT_BUF0, [OUTPUT_BUF]
- ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
- ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
- .unreq OUTPUT_BUF
-
- /* Save NEON registers */
- sub sp, sp, #64
- mov x9, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-
- /* Outer loop over scanlines */
- cmp NUM_ROWS, #1
- b.lt 9f
-0:
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
- mov N, OUTPUT_WIDTH
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
- add OUTPUT_ROW, OUTPUT_ROW, #1
- ldr RGB, [INPUT_BUF], #8
-
- /* Inner loop over pixels */
- subs N, N, #8
- b.lt 3f
- do_load \bpp, 8, \fast_ld3
- do_rgb_to_yuv_stage1
- subs N, N, #8
- b.lt 2f
-1:
- do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
- subs N, N, #8
- b.ge 1b
-2:
- do_rgb_to_yuv_stage2
- do_store 8
- tst N, #7
- b.eq 8f
-3:
- tbz N, #2, 3f
- do_load \bpp, 4, \fast_ld3
-3:
- tbz N, #1, 4f
- do_load \bpp, 2, \fast_ld3
-4:
- tbz N, #0, 5f
- do_load \bpp, 1, \fast_ld3
-5:
- do_rgb_to_yuv
- tbz N, #2, 6f
- do_store 4
-6:
- tbz N, #1, 7f
- do_store 2
-7:
- tbz N, #0, 8f
- do_store 1
-8:
- subs NUM_ROWS, NUM_ROWS, #1
- b.gt 0b
-9:
- /* Restore all registers and return */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- br x30
-
- .unreq OUTPUT_WIDTH
- .unreq OUTPUT_ROW
- .unreq INPUT_BUF
- .unreq NUM_ROWS
- .unreq OUTPUT_BUF0
- .unreq OUTPUT_BUF1
- .unreq OUTPUT_BUF2
- .unreq RGB
- .unreq Y
- .unreq U
- .unreq V
- .unreq N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R G B Fast LD3 */
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
-
-generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
-generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: could be combined with 'jsimd_fdct_ifast_neon' to get
- * rid of the ST1 stores to the workspace
- */
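-
-/*
- * Scalar equivalent of the function below (CENTERJSAMPLE is 128 in libjpeg):
- *
- *   static void convsamp(const uint8_t *sample_data[8], unsigned start_col,
- *                        int16_t *workspace)
- *   {
- *     for (int row = 0; row < 8; row++) {
- *       const uint8_t *p = sample_data[row] + start_col;
- *       for (int col = 0; col < 8; col++)     -- usubl vN.8h, vN.8b, v0.8b
- *         *workspace++ = (int16_t)p[col] - 128;
- *     }
- *   }
- */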
-
-asm_function jsimd_convsamp_neon
- SAMPLE_DATA .req x0
- START_COL .req x1
- WORKSPACE .req x2
- TMP1 .req x9
- TMP2 .req x10
- TMP3 .req x11
- TMP4 .req x12
- TMP5 .req x13
- TMP6 .req x14
- TMP7 .req x15
- TMP8 .req x4
- TMPDUP .req w3
-
- /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
- guarantee that the upper (unused) 32 bits of x1 are valid. This
- instruction ensures that those bits are set to zero. */
- uxtw x1, w1
-
- mov TMPDUP, #128
- ldp TMP1, TMP2, [SAMPLE_DATA], 16
- ldp TMP3, TMP4, [SAMPLE_DATA], 16
- dup v0.8b, TMPDUP
- add TMP1, TMP1, START_COL
- add TMP2, TMP2, START_COL
- ldp TMP5, TMP6, [SAMPLE_DATA], 16
- add TMP3, TMP3, START_COL
- add TMP4, TMP4, START_COL
- ldp TMP7, TMP8, [SAMPLE_DATA], 16
- add TMP5, TMP5, START_COL
- add TMP6, TMP6, START_COL
- ld1 {v16.8b}, [TMP1]
- add TMP7, TMP7, START_COL
- add TMP8, TMP8, START_COL
- ld1 {v17.8b}, [TMP2]
- usubl v16.8h, v16.8b, v0.8b
- ld1 {v18.8b}, [TMP3]
- usubl v17.8h, v17.8b, v0.8b
- ld1 {v19.8b}, [TMP4]
- usubl v18.8h, v18.8b, v0.8b
- ld1 {v20.8b}, [TMP5]
- usubl v19.8h, v19.8b, v0.8b
- ld1 {v21.8b}, [TMP6]
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
- usubl v20.8h, v20.8b, v0.8b
- ld1 {v22.8b}, [TMP7]
- usubl v21.8h, v21.8b, v0.8b
- ld1 {v23.8b}, [TMP8]
- usubl v22.8h, v22.8b, v0.8b
- usubl v23.8h, v23.8b, v0.8b
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
- br x30
-
- .unreq SAMPLE_DATA
- .unreq START_COL
- .unreq WORKSPACE
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMP4
- .unreq TMP5
- .unreq TMP6
- .unreq TMP7
- .unreq TMP8
- .unreq TMPDUP
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_islow_neon
- *
- * This function contains a slow-but-accurate integer implementation of the
- * forward DCT (Discrete Cosine Transform). The following code is based
- * directly on IJG's original jfdctint.c; see jfdctint.c for more details.
- *
- * TODO: could be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of LD1 instructions
- */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
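-
-/*
- * For readers without jfdctint.c at hand, the macros referenced in the
- * comments below expand, in essence, to:
- *
- *   FIX(x)         ~= (int32_t)(x * (1 << CONST_BITS) + 0.5)
- *   MULTIPLY(a, b)  = (int32_t)(a) * (b)
- *   DESCALE(x, n)   = ((x) + (1 << ((n) - 1))) >> (n)
- *
- * i.e. DESCALE is a rounding right shift, which is exactly what rshrn does.
- */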
-
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
-
-asm_function jsimd_fdct_islow_neon
-
- DATA .req x0
- TMP .req x9
-
- /* Load constants */
- get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
- ld1 {v0.8h, v1.8h}, [TMP]
-
- /* Save NEON registers */
- sub sp, sp, #64
- mov x10, sp
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | v16.8h
- * 1 | d18 | d19 | v17.8h
- * 2 | d20 | d21 | v18.8h
- * 3 | d22 | d23 | v19.8h
- * 4 | d24 | d25 | v20.8h
- * 5 | d26 | d27 | v21.8h
- * 6 | d28 | d29 | v22.8h
- * 7 | d30 | d31 | v23.8h
- */
-
- ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- sub DATA, DATA, #64
-
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
- /* 1-D FDCT */
- add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
- sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
- add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
- sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
- add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
- sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
- add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
- sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
-
- /* even part */
-
- add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
- sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
- add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
- sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
-
- add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
- sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
-
- add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
-
- shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
- shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
-
- smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
- mov v22.16b, v18.16b
- mov v25.16b, v24.16b
-
- smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
- smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
- smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
- smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-
- rshrn v18.4h, v18.4s, #DESCALE_P1
- rshrn v22.4h, v22.4s, #DESCALE_P1
- rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
-
- /* Odd part */
-
- add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
- add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
- add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
- add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
- smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
- smull2 v5.4s, v10.8h, XFIX_P_1_175
- smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
- smlal2 v5.4s, v11.8h, XFIX_P_1_175
-
- smull2 v24.4s, v28.8h, XFIX_P_0_298
- smull2 v25.4s, v29.8h, XFIX_P_2_053
- smull2 v26.4s, v30.8h, XFIX_P_3_072
- smull2 v27.4s, v31.8h, XFIX_P_1_501
- smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
- smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
- smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
- smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
-
- smull2 v12.4s, v8.8h, XFIX_N_0_899
- smull2 v13.4s, v9.8h, XFIX_N_2_562
- smull2 v14.4s, v10.8h, XFIX_N_1_961
- smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
-
- add v10.4s, v10.4s, v4.4s /* z3 += z5 */
- add v14.4s, v14.4s, v5.4s
- add v11.4s, v11.4s, v4.4s /* z4 += z5 */
- add v15.4s, v15.4s, v5.4s
-
- add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
- add v24.4s, v24.4s, v12.4s
- add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
- add v25.4s, v25.4s, v13.4s
- add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
- add v26.4s, v26.4s, v14.4s
- add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
- add v27.4s, v27.4s, v15.4s
-
- add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
- add v24.4s, v24.4s, v14.4s
- add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
- add v25.4s, v25.4s, v15.4s
- add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
- add v26.4s, v26.4s, v13.4s
- add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
- add v27.4s, v27.4s, v12.4s
-
- rshrn v23.4h, v28.4s, #DESCALE_P1
- rshrn v21.4h, v29.4s, #DESCALE_P1
- rshrn v19.4h, v30.4s, #DESCALE_P1
- rshrn v17.4h, v31.4s, #DESCALE_P1
- rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
-
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
-
- /* 1-D FDCT */
- add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
- sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
- add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
- sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
- add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
- sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
- add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
- sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
-
- /* even part */
- add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
- sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
- add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
- sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
-
- add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
- sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
-
- add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
-
- srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
- srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
-
- smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
- smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
- mov v22.16b, v18.16b
- mov v25.16b, v24.16b
-
- smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
- smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
- smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
- smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
-
- rshrn v18.4h, v18.4s, #DESCALE_P2
- rshrn v22.4h, v22.4s, #DESCALE_P2
- rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
- rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
-
- /* Odd part */
- add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
- add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
- add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
- add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
-
- smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
- smull2 v5.4s, v10.8h, XFIX_P_1_175
- smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
- smlal2 v5.4s, v11.8h, XFIX_P_1_175
-
- smull2 v24.4s, v28.8h, XFIX_P_0_298
- smull2 v25.4s, v29.8h, XFIX_P_2_053
- smull2 v26.4s, v30.8h, XFIX_P_3_072
- smull2 v27.4s, v31.8h, XFIX_P_1_501
- smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
- smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
- smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
- smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
-
- smull2 v12.4s, v8.8h, XFIX_N_0_899
- smull2 v13.4s, v9.8h, XFIX_N_2_562
- smull2 v14.4s, v10.8h, XFIX_N_1_961
- smull2 v15.4s, v11.8h, XFIX_N_0_390
- smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
- smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
- smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
- smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
-
- add v10.4s, v10.4s, v4.4s
- add v14.4s, v14.4s, v5.4s
- add v11.4s, v11.4s, v4.4s
- add v15.4s, v15.4s, v5.4s
-
- add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
- add v24.4s, v24.4s, v12.4s
- add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
- add v25.4s, v25.4s, v13.4s
- add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
- add v26.4s, v26.4s, v14.4s
- add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
- add v27.4s, v27.4s, v15.4s
-
- add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
- add v24.4s, v24.4s, v14.4s
- add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
- add v25.4s, v25.4s, v15.4s
- add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
- add v26.4s, v26.4s, v13.4s
- add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
- add v27.4s, v27.4s, v12.4s
-
- rshrn v23.4h, v28.4s, #DESCALE_P2
- rshrn v21.4h, v29.4s, #DESCALE_P2
- rshrn v19.4h, v30.4s, #DESCALE_P2
- rshrn v17.4h, v31.4s, #DESCALE_P2
- rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
- rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
- rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
-
- /* store results */
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
- /* Restore NEON registers */
- ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-
- br x30
-
- .unreq DATA
- .unreq TMP
-
-#undef XFIX_P_0_298
-#undef XFIX_N_0_390
-#undef XFIX_P_0_541
-#undef XFIX_P_0_765
-#undef XFIX_N_0_899
-#undef XFIX_P_1_175
-#undef XFIX_P_1_501
-#undef XFIX_N_1_847
-#undef XFIX_N_1_961
-#undef XFIX_P_2_053
-#undef XFIX_N_2_562
-#undef XFIX_P_3_072
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c.
- *
- * TODO: could be combined with 'jsimd_convsamp_neon' to get
- * rid of a bunch of LD1 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433 v0.h[0]
-#define XFIX_0_541196100 v0.h[1]
-#define XFIX_0_707106781 v0.h[2]
-#define XFIX_1_306562965 v0.h[3]
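-
-/*
- * These constants are the AAN scale factors in Q15 (e.g. 0.707106781 is
- * stored as roughly 23168); a factor above 1.0 is stored minus one, with
- * the integer part restored by an extra add in the code below. The exact
- * encodings live in Ljsimd_fdct_ifast_neon_consts, outside this hunk.
- * sqdmulh computes a saturating (2 * a * b) >> 16 per lane, so one
- * instruction implements the ifast MULTIPLY; a scalar sketch:
- *
- *   static inline int16_t mul_ifast(int16_t a, int16_t c_q15)
- *   {
- *     return (int16_t)(((int32_t)a * c_q15 * 2) >> 16);
- *   }
- */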
-
-asm_function jsimd_fdct_ifast_neon
-
- DATA .req x0
- TMP .req x9
-
- /* Load constants */
- get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
- ld1 {v0.4h}, [TMP]
-
- /* Load all DATA into NEON registers with the following allocation:
- * 0 1 2 3 | 4 5 6 7
- * ---------+--------
- * 0 | d16 | d17 | v16.8h
- * 1 | d18 | d19 | v17.8h
- * 2 | d20 | d21 | v18.8h
- * 3 | d22 | d23 | v19.8h
- * 4 | d24 | d25 | v20.8h
- * 5 | d26 | d27 | v21.8h
- * 6 | d28 | d29 | v22.8h
- * 7 | d30 | d31 | v23.8h
- */
-
- ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
- mov TMP, #2
- sub DATA, DATA, #64
-1:
- /* Transpose */
- transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
- subs TMP, TMP, #1
- /* 1-D FDCT */
- add v4.8h, v19.8h, v20.8h
- sub v20.8h, v19.8h, v20.8h
- sub v28.8h, v18.8h, v21.8h
- add v18.8h, v18.8h, v21.8h
- sub v29.8h, v17.8h, v22.8h
- add v17.8h, v17.8h, v22.8h
- sub v21.8h, v16.8h, v23.8h
- add v16.8h, v16.8h, v23.8h
- sub v6.8h, v17.8h, v18.8h
- sub v7.8h, v16.8h, v4.8h
- add v5.8h, v17.8h, v18.8h
- add v6.8h, v6.8h, v7.8h
- add v4.8h, v16.8h, v4.8h
- sqdmulh v6.8h, v6.8h, XFIX_0_707106781
- add v19.8h, v20.8h, v28.8h
- add v16.8h, v4.8h, v5.8h
- sub v20.8h, v4.8h, v5.8h
- add v5.8h, v28.8h, v29.8h
- add v29.8h, v29.8h, v21.8h
- sqdmulh v5.8h, v5.8h, XFIX_0_707106781
- sub v28.8h, v19.8h, v29.8h
- add v18.8h, v7.8h, v6.8h
- sqdmulh v28.8h, v28.8h, XFIX_0_382683433
- sub v22.8h, v7.8h, v6.8h
- sqdmulh v19.8h, v19.8h, XFIX_0_541196100
- sqdmulh v7.8h, v29.8h, XFIX_1_306562965
- add v6.8h, v21.8h, v5.8h
- sub v5.8h, v21.8h, v5.8h
- add v29.8h, v29.8h, v28.8h
- add v19.8h, v19.8h, v28.8h
- add v29.8h, v29.8h, v7.8h
- add v21.8h, v5.8h, v19.8h
- sub v19.8h, v5.8h, v19.8h
- add v17.8h, v6.8h, v29.8h
- sub v23.8h, v6.8h, v29.8h
-
- b.ne 1b
-
- /* store results */
- st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
- st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
- br x30
-
- .unreq DATA
- .unreq TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- */
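-
-/*
- * Scalar sketch of the reciprocal-based quantization below. recip, corr
- * and shift are per-coefficient values from the three 64-entry tables at
- * DIVISORS + 0, + 128 and + 384 bytes (matching the adds below):
- *
- *   static inline int16_t quantize_one(int16_t c, uint16_t recip,
- *                                      uint16_t corr, int shift)
- *   {
- *     int16_t sign = c >> 15;                    -- sshr #15: 0 or -1
- *     uint16_t a = (uint16_t)((c ^ sign) - sign);   -- abs(c)
- *     uint16_t q = (uint16_t)(((uint32_t)(a + corr) * recip) >> 16);
- *     q = (uint16_t)(q >> shift);                -- ushl by negated shift
- *     return (int16_t)((q ^ sign) - sign);       -- restore sign
- *   }
- */
-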
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req x0
- DIVISORS .req x1
- WORKSPACE .req x2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req x9
- SHIFT .req x10
- LOOP_COUNT .req x11
-
- mov LOOP_COUNT, #2
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
-1:
- subs LOOP_COUNT, LOOP_COUNT, #1
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
- abs v20.8h, v0.8h
- abs v21.8h, v1.8h
- abs v22.8h, v2.8h
- abs v23.8h, v3.8h
- ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
- add v20.8h, v20.8h, v4.8h /* add correction */
- add v21.8h, v21.8h, v5.8h
- add v22.8h, v22.8h, v6.8h
- add v23.8h, v23.8h, v7.8h
- umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
- umull2 v16.4s, v20.8h, v28.8h
- umull v5.4s, v21.4h, v29.4h
- umull2 v17.4s, v21.8h, v29.8h
- umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
- umull2 v18.4s, v22.8h, v30.8h
- umull v7.4s, v23.4h, v31.4h
- umull2 v19.4s, v23.8h, v31.8h
- ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
- shrn v4.4h, v4.4s, #16
- shrn v5.4h, v5.4s, #16
- shrn v6.4h, v6.4s, #16
- shrn v7.4h, v7.4s, #16
- shrn2 v4.8h, v16.4s, #16
- shrn2 v5.8h, v17.4s, #16
- shrn2 v6.8h, v18.4s, #16
- shrn2 v7.8h, v19.4s, #16
- neg v24.8h, v24.8h
- neg v25.8h, v25.8h
- neg v26.8h, v26.8h
- neg v27.8h, v27.8h
- sshr v0.8h, v0.8h, #15 /* extract sign */
- sshr v1.8h, v1.8h, #15
- sshr v2.8h, v2.8h, #15
- sshr v3.8h, v3.8h, #15
- ushl v4.8h, v4.8h, v24.8h /* shift */
- ushl v5.8h, v5.8h, v25.8h
- ushl v6.8h, v6.8h, v26.8h
- ushl v7.8h, v7.8h, v27.8h
-
- eor v4.16b, v4.16b, v0.16b /* restore sign */
- eor v5.16b, v5.16b, v1.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v3.16b
- sub v4.8h, v4.8h, v0.8h
- sub v5.8h, v5.8h, v1.8h
- sub v6.8h, v6.8h, v2.8h
- sub v7.8h, v7.8h, v3.8h
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
- b.ne 1b
-
- br x30 /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor,
- * JDIMENSION width_in_blocks,
- * JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
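-
-/*
- * Scalar equivalent of the inner loop. The alternating 0,1 bias comes from
- * the 0x10000 pattern replicated into v16 below and keeps the rounding
- * from drifting in one direction:
- *
- *   static void h2v1_row(const uint8_t *in, uint8_t *out, int ncols)
- *   {
- *     for (int i = 0; i < ncols; i++)
- *       out[i] = (uint8_t)((in[2 * i] + in[2 * i + 1] + (i & 1)) >> 1);
- *   }
- */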
-
-asm_function jsimd_h2v1_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR .req x10
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #0x10000
- lsl TMP2, BLOCK_WIDTH, #4
- sub TMP2, TMP2, IMAGE_WIDTH
- get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR]
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- uadalp v4.8h, v2.16b
- shrn v6.8b, v4.8h, #1
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- * JDIMENSION v_samp_factor,
- * JDIMENSION width_in_blocks,
- * JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
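-
-/*
- * Scalar equivalent; here the bias alternates 1,2 (the 0x20001 constant
- * assembled in TMPDUP below), again to balance the rounding:
- *
- *   static void h2v2_row(const uint8_t *in0, const uint8_t *in1,
- *                        uint8_t *out, int ncols)
- *   {
- *     for (int i = 0; i < ncols; i++)
- *       out[i] = (uint8_t)((in0[2 * i] + in0[2 * i + 1] +
- *                           in1[2 * i] + in1[2 * i + 1] + 1 + (i & 1)) >> 2);
- *   }
- */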
-
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
- IMAGE_WIDTH .req x0
- MAX_V_SAMP .req x1
- V_SAMP .req x2
- BLOCK_WIDTH .req x3
- INPUT_DATA .req x4
- OUTPUT_DATA .req x5
- OUTPTR .req x9
- INPTR0 .req x10
- INPTR1 .req x14
- TMP1 .req x11
- TMP2 .req x12
- TMP3 .req x13
- TMPDUP .req w15
-
- mov TMPDUP, #1
- lsl TMP2, BLOCK_WIDTH, #4
- lsl TMPDUP, TMPDUP, #17
- sub TMP2, TMP2, IMAGE_WIDTH
- get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
- orr TMPDUP, TMPDUP, #1
- add TMP3, TMP3, TMP2, lsl #4
- dup v16.4s, TMPDUP
- ld1 {v18.16b}, [TMP3]
-
-1: /* row loop */
- ldr INPTR0, [INPUT_DATA], #8
- ldr OUTPTR, [OUTPUT_DATA], #8
- ldr INPTR1, [INPUT_DATA], #8
- subs TMP1, BLOCK_WIDTH, #1
- b.eq 3f
-2: /* columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs TMP1, TMP1, #1
- uadalp v4.8h, v0.16b
- uadalp v4.8h, v1.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 2b
-3: /* last columns */
- ld1 {v0.16b}, [INPTR0], #16
- ld1 {v1.16b}, [INPTR1], #16
- mov v4.16b, v16.16b
- subs V_SAMP, V_SAMP, #1
- /* expand right */
- tbl v2.16b, {v0.16b}, v18.16b
- tbl v3.16b, {v1.16b}, v18.16b
- uadalp v4.8h, v2.16b
- uadalp v4.8h, v3.16b
- shrn v6.8b, v4.8h, #2
- st1 {v6.8b}, [OUTPTR], #8
- b.ne 1b
-
- br x30
-
- .unreq IMAGE_WIDTH
- .unreq MAX_V_SAMP
- .unreq V_SAMP
- .unreq BLOCK_WIDTH
- .unreq INPUT_DATA
- .unreq OUTPUT_DATA
- .unreq OUTPTR
- .unreq INPTR0
- .unreq INPTR1
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
- .unreq TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET *)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
- BUFFER .req x1
- PUT_BUFFER .req x6
- PUT_BITS .req x7
- PUT_BITSw .req w7
-
-.macro emit_byte
- sub PUT_BITS, PUT_BITS, #0x8
- lsr x19, PUT_BUFFER, PUT_BITS
- uxtb w19, w19
- strb w19, [BUFFER, #1]!
- cmp w19, #0xff
- b.ne 14f
- strb wzr, [BUFFER, #1]!
-14:
-.endm
-.macro put_bits CODE, SIZE
- lsl PUT_BUFFER, PUT_BUFFER, \SIZE
- add PUT_BITS, PUT_BITS, \SIZE
- orr PUT_BUFFER, PUT_BUFFER, \CODE
-.endm
-.macro checkbuf31
- cmp PUT_BITS, #0x20
- b.lt 31f
- emit_byte
- emit_byte
- emit_byte
- emit_byte
-31:
-.endm
-.macro checkbuf47
- cmp PUT_BITS, #0x30
- b.lt 47f
- emit_byte
- emit_byte
- emit_byte
- emit_byte
- emit_byte
- emit_byte
-47:
-.endm
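-
-/*
- * Scalar shape of the bit buffer these macros maintain in PUT_BUFFER /
- * PUT_BITS (illustrative only; the assembly flushes in fixed groups of
- * four or six bytes rather than one at a time):
- *
- *   static uint8_t *put_bits_c(uint8_t *buf, uint64_t *acc, int *nbits,
- *                              uint32_t code, int size)
- *   {
- *     *acc = (*acc << size) | code;              -- put_bits
- *     *nbits += size;
- *     while (*nbits >= 32) {                     -- checkbuf31/checkbuf47
- *       *nbits -= 8;                             -- emit_byte
- *       uint8_t byte = (uint8_t)(*acc >> *nbits);
- *       *buf++ = byte;
- *       if (byte == 0xff)
- *         *buf++ = 0;                            -- JPEG 0xFF byte stuffing
- *     }
- *     return buf;
- *   }
- */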
-
-.macro generate_jsimd_huff_encode_one_block fast_tbl
-
-.balign 16
-
-.if \fast_tbl == 1
-asm_function jsimd_huff_encode_one_block_neon
-.else
-asm_function jsimd_huff_encode_one_block_neon_slowtbl
-.endif
- sub sp, sp, 272
- sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
- /* Save ARM registers */
- stp x19, x20, [sp]
-.if \fast_tbl == 1
- get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
- get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
- ldr PUT_BUFFER, [x0, #0x10]
- ldr PUT_BITSw, [x0, #0x18]
- ldrsh w12, [x2] /* load DC coeff in w12 */
- /* prepare data */
-.if \fast_tbl == 1
- ld1 {v23.16b}, [x15], #16
- ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
- ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
- ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
- ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
- sub w12, w12, w3 /* last_dc_val, not used afterwards */
- /* ZigZag 8x8 */
- tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
- tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
- tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
- tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
- tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
- tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
- tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
- tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
- ins v0.h[0], w12
- tbx v1.16b, {v28.16b}, v16.16b
- tbx v2.16b, {v29.16b, v30.16b}, v17.16b
- tbx v5.16b, {v29.16b, v30.16b}, v18.16b
- tbx v6.16b, {v31.16b}, v19.16b
-.else
- add x13, x2, #0x22
- sub w12, w12, w3 /* last_dc_val, not used afterwards */
- ld1 {v23.16b}, [x15]
- add x14, x2, #0x18
- add x3, x2, #0x36
- ins v0.h[0], w12
- add x9, x2, #0x2
- ld1 {v1.h}[0], [x13]
- add x15, x2, #0x30
- ld1 {v2.h}[0], [x14]
- add x19, x2, #0x26
- ld1 {v3.h}[0], [x3]
- add x20, x2, #0x28
- ld1 {v0.h}[1], [x9]
- add x12, x2, #0x10
- ld1 {v1.h}[1], [x15]
- add x13, x2, #0x40
- ld1 {v2.h}[1], [x19]
- add x14, x2, #0x34
- ld1 {v3.h}[1], [x20]
- add x3, x2, #0x1a
- ld1 {v0.h}[2], [x12]
- add x9, x2, #0x20
- ld1 {v1.h}[2], [x13]
- add x15, x2, #0x32
- ld1 {v2.h}[2], [x14]
- add x19, x2, #0x42
- ld1 {v3.h}[2], [x3]
- add x20, x2, #0xc
- ld1 {v0.h}[3], [x9]
- add x12, x2, #0x12
- ld1 {v1.h}[3], [x15]
- add x13, x2, #0x24
- ld1 {v2.h}[3], [x19]
- add x14, x2, #0x50
- ld1 {v3.h}[3], [x20]
- add x3, x2, #0xe
- ld1 {v0.h}[4], [x12]
- add x9, x2, #0x4
- ld1 {v1.h}[4], [x13]
- add x15, x2, #0x16
- ld1 {v2.h}[4], [x14]
- add x19, x2, #0x60
- ld1 {v3.h}[4], [x3]
- add x20, x2, #0x1c
- ld1 {v0.h}[5], [x9]
- add x12, x2, #0x6
- ld1 {v1.h}[5], [x15]
- add x13, x2, #0x8
- ld1 {v2.h}[5], [x19]
- add x14, x2, #0x52
- ld1 {v3.h}[5], [x20]
- add x3, x2, #0x2a
- ld1 {v0.h}[6], [x12]
- add x9, x2, #0x14
- ld1 {v1.h}[6], [x13]
- add x15, x2, #0xa
- ld1 {v2.h}[6], [x14]
- add x19, x2, #0x44
- ld1 {v3.h}[6], [x3]
- add x20, x2, #0x38
- ld1 {v0.h}[7], [x9]
- add x12, x2, #0x46
- ld1 {v1.h}[7], [x15]
- add x13, x2, #0x3a
- ld1 {v2.h}[7], [x19]
- add x14, x2, #0x74
- ld1 {v3.h}[7], [x20]
- add x3, x2, #0x6a
- ld1 {v4.h}[0], [x12]
- add x9, x2, #0x54
- ld1 {v5.h}[0], [x13]
- add x15, x2, #0x2c
- ld1 {v6.h}[0], [x14]
- add x19, x2, #0x76
- ld1 {v7.h}[0], [x3]
- add x20, x2, #0x78
- ld1 {v4.h}[1], [x9]
- add x12, x2, #0x62
- ld1 {v5.h}[1], [x15]
- add x13, x2, #0x1e
- ld1 {v6.h}[1], [x19]
- add x14, x2, #0x68
- ld1 {v7.h}[1], [x20]
- add x3, x2, #0x7a
- ld1 {v4.h}[2], [x12]
- add x9, x2, #0x70
- ld1 {v5.h}[2], [x13]
- add x15, x2, #0x2e
- ld1 {v6.h}[2], [x14]
- add x19, x2, #0x5a
- ld1 {v7.h}[2], [x3]
- add x20, x2, #0x6c
- ld1 {v4.h}[3], [x9]
- add x12, x2, #0x72
- ld1 {v5.h}[3], [x15]
- add x13, x2, #0x3c
- ld1 {v6.h}[3], [x19]
- add x14, x2, #0x4c
- ld1 {v7.h}[3], [x20]
- add x3, x2, #0x5e
- ld1 {v4.h}[4], [x12]
- add x9, x2, #0x64
- ld1 {v5.h}[4], [x13]
- add x15, x2, #0x4a
- ld1 {v6.h}[4], [x14]
- add x19, x2, #0x3e
- ld1 {v7.h}[4], [x3]
- add x20, x2, #0x6e
- ld1 {v4.h}[5], [x9]
- add x12, x2, #0x56
- ld1 {v5.h}[5], [x15]
- add x13, x2, #0x58
- ld1 {v6.h}[5], [x19]
- add x14, x2, #0x4e
- ld1 {v7.h}[5], [x20]
- add x3, x2, #0x7c
- ld1 {v4.h}[6], [x12]
- add x9, x2, #0x48
- ld1 {v5.h}[6], [x13]
- add x15, x2, #0x66
- ld1 {v6.h}[6], [x14]
- add x19, x2, #0x5c
- ld1 {v7.h}[6], [x3]
- add x20, x2, #0x7e
- ld1 {v4.h}[7], [x9]
- ld1 {v5.h}[7], [x15]
- ld1 {v6.h}[7], [x19]
- ld1 {v7.h}[7], [x20]
-.endif
- cmlt v24.8h, v0.8h, #0
- cmlt v25.8h, v1.8h, #0
- cmlt v26.8h, v2.8h, #0
- cmlt v27.8h, v3.8h, #0
- cmlt v28.8h, v4.8h, #0
- cmlt v29.8h, v5.8h, #0
- cmlt v30.8h, v6.8h, #0
- cmlt v31.8h, v7.8h, #0
- abs v0.8h, v0.8h
- abs v1.8h, v1.8h
- abs v2.8h, v2.8h
- abs v3.8h, v3.8h
- abs v4.8h, v4.8h
- abs v5.8h, v5.8h
- abs v6.8h, v6.8h
- abs v7.8h, v7.8h
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- eor v28.16b, v28.16b, v4.16b
- eor v29.16b, v29.16b, v5.16b
- eor v30.16b, v30.16b, v6.16b
- eor v31.16b, v31.16b, v7.16b
- cmeq v16.8h, v0.8h, #0
- cmeq v17.8h, v1.8h, #0
- cmeq v18.8h, v2.8h, #0
- cmeq v19.8h, v3.8h, #0
- cmeq v20.8h, v4.8h, #0
- cmeq v21.8h, v5.8h, #0
- cmeq v22.8h, v6.8h, #0
- xtn v16.8b, v16.8h
- xtn v18.8b, v18.8h
- xtn v20.8b, v20.8h
- xtn v22.8b, v22.8h
- umov w14, v0.h[0]
- xtn2 v16.16b, v17.8h
- umov w13, v24.h[0]
- xtn2 v18.16b, v19.8h
- clz w14, w14
- xtn2 v20.16b, v21.8h
- lsl w13, w13, w14
- cmeq v17.8h, v7.8h, #0
- sub w12, w14, #32
- xtn2 v22.16b, v17.8h
- lsr w13, w13, w14
- and v16.16b, v16.16b, v23.16b
- neg w12, w12
- and v18.16b, v18.16b, v23.16b
- add x3, x4, #0x400 /* x3 = dctbl->ehufsi */
- and v20.16b, v20.16b, v23.16b
- add x15, sp, #0x90 /* x15 = t2 */
- and v22.16b, v22.16b, v23.16b
- ldr w10, [x4, x12, lsl #2]
- addp v16.16b, v16.16b, v18.16b
- ldrb w11, [x3, x12]
- addp v20.16b, v20.16b, v22.16b
- checkbuf47
- addp v16.16b, v16.16b, v20.16b
- put_bits x10, x11
- addp v16.16b, v16.16b, v18.16b
- checkbuf47
- umov x9, v16.D[0]
- put_bits x13, x12
- cnt v17.8b, v16.8b
- mvn x9, x9
- addv B18, v17.8b
- add x4, x5, #0x400 /* x4 = actbl->ehufsi */
- umov w12, v18.b[0]
- lsr x9, x9, #0x1 /* clear AC coeff */
- ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
- rbit x9, x9 /* x9 = index0 */
- ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
- cmp w12, #(64-8)
- add x11, sp, #16
- b.lt 4f
- cbz x9, 6f
- st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
- st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
- st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
-1:
- clz x2, x9
- add x15, x15, x2, lsl #1
- lsl x9, x9, x2
- ldrh w20, [x15, #-126]
-2:
- cmp x2, #0x10
- b.lt 3f
- sub x2, x2, #0x10
- checkbuf47
- put_bits x13, x14
- b 2b
-3:
- clz w20, w20
- ldrh w3, [x15, #2]!
- sub w11, w20, #32
- lsl w3, w3, w20
- neg w11, w11
- lsr w3, w3, w20
- add x2, x11, x2, lsl #4
- lsl x9, x9, #0x1
- ldr w12, [x5, x2, lsl #2]
- ldrb w10, [x4, x2]
- checkbuf31
- put_bits x12, x10
- put_bits x3, x11
- cbnz x9, 1b
- b 6f
-4:
- movi v21.8h, #0x0010
- clz v0.8h, v0.8h
- clz v1.8h, v1.8h
- clz v2.8h, v2.8h
- clz v3.8h, v3.8h
- clz v4.8h, v4.8h
- clz v5.8h, v5.8h
- clz v6.8h, v6.8h
- clz v7.8h, v7.8h
- ushl v24.8h, v24.8h, v0.8h
- ushl v25.8h, v25.8h, v1.8h
- ushl v26.8h, v26.8h, v2.8h
- ushl v27.8h, v27.8h, v3.8h
- ushl v28.8h, v28.8h, v4.8h
- ushl v29.8h, v29.8h, v5.8h
- ushl v30.8h, v30.8h, v6.8h
- ushl v31.8h, v31.8h, v7.8h
- neg v0.8h, v0.8h
- neg v1.8h, v1.8h
- neg v2.8h, v2.8h
- neg v3.8h, v3.8h
- neg v4.8h, v4.8h
- neg v5.8h, v5.8h
- neg v6.8h, v6.8h
- neg v7.8h, v7.8h
- ushl v24.8h, v24.8h, v0.8h
- ushl v25.8h, v25.8h, v1.8h
- ushl v26.8h, v26.8h, v2.8h
- ushl v27.8h, v27.8h, v3.8h
- ushl v28.8h, v28.8h, v4.8h
- ushl v29.8h, v29.8h, v5.8h
- ushl v30.8h, v30.8h, v6.8h
- ushl v31.8h, v31.8h, v7.8h
- add v0.8h, v21.8h, v0.8h
- add v1.8h, v21.8h, v1.8h
- add v2.8h, v21.8h, v2.8h
- add v3.8h, v21.8h, v3.8h
- add v4.8h, v21.8h, v4.8h
- add v5.8h, v21.8h, v5.8h
- add v6.8h, v21.8h, v6.8h
- add v7.8h, v21.8h, v7.8h
- st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
- st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
- st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
-1:
- clz x2, x9
- add x15, x15, x2, lsl #1
- lsl x9, x9, x2
- ldrh w11, [x15, #-126]
-2:
- cmp x2, #0x10
- b.lt 3f
- sub x2, x2, #0x10
- checkbuf47
- put_bits x13, x14
- b 2b
-3:
- ldrh w3, [x15, #2]!
- add x2, x11, x2, lsl #4
- lsl x9, x9, #0x1
- ldr w12, [x5, x2, lsl #2]
- ldrb w10, [x4, x2]
- checkbuf31
- put_bits x12, x10
- put_bits x3, x11
- cbnz x9, 1b
-6:
- add x13, sp, #0x10e
- cmp x15, x13
- b.hs 1f
- ldr w12, [x5]
- ldrb w14, [x4]
- checkbuf47
- put_bits x12, x14
-1:
- str PUT_BUFFER, [x0, #0x10]
- str PUT_BITSw, [x0, #0x18]
- ldp x19, x20, [sp], 16
- add x0, BUFFER, #0x1
- add sp, sp, 256
- br x30
-
-.endm
-
-generate_jsimd_huff_encode_one_block 1
-generate_jsimd_huff_encode_one_block 0
-
- .unreq BUFFER
- .unreq PUT_BUFFER
- .unreq PUT_BITS
- .unreq PUT_BITSw
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf31
-.purgem checkbuf47
diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm
index 7a8d784..c46d684 100644
--- a/simd/i386/jccolext-avx2.asm
+++ b/simd/i386/jccolext-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -110,12 +108,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
+ movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm
index 9a2c30e..6357a42 100644
--- a/simd/i386/jccolext-mmx.asm
+++ b/simd/i386/jccolext-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -111,13 +109,13 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
xor eax, eax
- mov al, BYTE [esi+ecx]
+ mov al, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
xor edx, edx
- mov dx, WORD [esi+ecx]
+ mov dx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
@@ -127,7 +125,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
+ movd mmG, dword [esi+ecx]
psllq mmA, DWORD_BIT
por mmA, mmG
.column_ld8:
@@ -197,7 +195,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx):
test cl, SIZEOF_MMWORD/8
jz short .column_ld2
sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_MMWORD/4
jz short .column_ld4
diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm
index e830562..c6c8085 100644
--- a/simd/i386/jccolext-sse2.asm
+++ b/simd/i386/jccolext-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -109,12 +107,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
+ movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
diff --git a/simd/i386/jccolor-avx2.asm b/simd/i386/jccolor-avx2.asm
index 958517f..14944e9 100644
--- a/simd/i386/jccolor-avx2.asm
+++ b/simd/i386/jccolor-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jccolor-mmx.asm b/simd/i386/jccolor-mmx.asm
index 47be9e1..8cb399b 100644
--- a/simd/i386/jccolor-mmx.asm
+++ b/simd/i386/jccolor-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jccolor-sse2.asm b/simd/i386/jccolor-sse2.asm
index c0d5d45..686d222 100644
--- a/simd/i386/jccolor-sse2.asm
+++ b/simd/i386/jccolor-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcgray-avx2.asm b/simd/i386/jcgray-avx2.asm
index 4d66242..560ee0c 100644
--- a/simd/i386/jcgray-avx2.asm
+++ b/simd/i386/jcgray-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcgray-mmx.asm b/simd/i386/jcgray-mmx.asm
index 07c7ea6..79fdf08 100644
--- a/simd/i386/jcgray-mmx.asm
+++ b/simd/i386/jcgray-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcgray-sse2.asm b/simd/i386/jcgray-sse2.asm
index 4b8c797..cb4b28e 100644
--- a/simd/i386/jcgray-sse2.asm
+++ b/simd/i386/jcgray-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm
index 52e99a8..3fa7973 100644
--- a/simd/i386/jcgryext-avx2.asm
+++ b/simd/i386/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -102,12 +100,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
+ movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm
index 4a9ab0d..8af42e5 100644
--- a/simd/i386/jcgryext-mmx.asm
+++ b/simd/i386/jcgryext-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -103,13 +101,13 @@ EXTN(jsimd_rgb_gray_convert_mmx):
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
xor eax, eax
- mov al, BYTE [esi+ecx]
+ mov al, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
xor edx, edx
- mov dx, WORD [esi+ecx]
+ mov dx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
@@ -119,7 +117,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
test cl, SIZEOF_DWORD
jz short .column_ld8
sub ecx, byte SIZEOF_DWORD
- movd mmG, DWORD [esi+ecx]
+ movd mmG, dword [esi+ecx]
psllq mmA, DWORD_BIT
por mmA, mmG
.column_ld8:
@@ -189,7 +187,7 @@ EXTN(jsimd_rgb_gray_convert_mmx):
test cl, SIZEOF_MMWORD/8
jz short .column_ld2
sub ecx, byte SIZEOF_MMWORD/8
- movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+ movd mmA, dword [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
test cl, SIZEOF_MMWORD/4
jz short .column_ld4
diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm
index 04d891c..c9d6ff1 100644
--- a/simd/i386/jcgryext-sse2.asm
+++ b/simd/i386/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -101,12 +99,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub ecx, byte SIZEOF_BYTE
- movzx eax, BYTE [esi+ecx]
+ movzx eax, byte [esi+ecx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub ecx, byte SIZEOF_WORD
- movzx edx, WORD [esi+ecx]
+ movzx edx, word [esi+ecx]
shl eax, WORD_BIT
or eax, edx
.column_ld4:
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index 6ea69f6..d0112e6 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -17,8 +17,6 @@
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based directly on jchuff.c; see jchuff.c for more
; details.
-;
-; [TAB8]
%include "jsimdext.inc"
@@ -27,11 +25,10 @@
alignz 32
GLOBAL_DATA(jconst_huff_encode_one_block)
+ EXTERN EXTN(jpeg_nbits_table)
EXTN(jconst_huff_encode_one_block):
-%include "jpeg_nbits_table.inc"
-
alignz 32
; --------------------------------------------------------------------------
@@ -197,8 +194,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
push ebp
mov esi, POINTER [eax+8] ; (working_state *state)
- mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits;
+ mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer;
+ mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits;
push esi ; esi is now scratch
get_GOT edx ; get GOT address
@@ -214,7 +211,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; Encode the DC coefficient difference per section F.1.2.1
mov esi, POINTER [esp+block] ; block
movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
- sub ecx, DWORD [eax+20]
+ sub ecx, dword [eax+20]
mov esi, ecx
; This is a well-known technique for obtaining the absolute value
@@ -229,12 +226,12 @@ EXTN(jsimd_huff_encode_one_block_sse2):
; For a negative input, want temp2 = bitwise complement of abs(input)
; This code assumes we are on a two's complement machine
add esi, edx ; temp2 += temp3;
- mov DWORD [esp+temp], esi ; backup temp2 in temp
+ mov dword [esp+temp], esi ; backup temp2 in temp
; Find the number of bits needed for the magnitude of the coefficient
movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
- movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], edx ; backup nbits in temp2
+ movzx edx, byte [GOTOFF(ebp, EXTN(jpeg_nbits_table) + ecx)] ; nbits = JPEG_NBITS(temp);
+ mov dword [esp+temp2], edx ; backup nbits in temp2
; Emit the Huffman-coded symbol for the number of bits
mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
@@ -242,13 +239,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
EMIT_BITS eax ; EMIT_BITS(code, size)
- mov ecx, DWORD [esp+temp2] ; restore nbits
+ mov ecx, dword [esp+temp2] ; restore nbits
; Mask off any extra bits in code
mov eax, 1
shl eax, cl
dec eax
- and eax, DWORD [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
+ and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
; Emit that number of bits of the value, if positive,
; or the complement of its magnitude, if negative.
@@ -291,22 +288,22 @@ EXTN(jsimd_huff_encode_one_block_sse2):
jz near .ELOOP
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
+ mov dword [esp+temp3], edx
.BRLOOP:
cmp ecx, 16 ; while (r > 15) {
jl near .ERLOOP
sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
+ mov dword [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
+ mov ecx, dword [esp+temp]
jmp .BRLOOP
.ERLOOP:
movsx eax, word [esi] ; temp = t1[k];
movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
- movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp);
- mov DWORD [esp+temp2], eax
+ movzx eax, byte [GOTOFF(edx, EXTN(jpeg_nbits_table) + eax)] ; nbits = JPEG_NBITS(temp);
+ mov dword [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
@@ -316,13 +313,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
+ mov ecx, dword [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
+ mov edx, dword [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
@@ -352,29 +349,29 @@ EXTN(jsimd_huff_encode_one_block_sse2):
shr edx, cl ; index >>= r;
add ecx, eax
lea esi, [esi+ecx*2] ; k += r;
- mov DWORD [esp+temp3], edx
+ mov dword [esp+temp3], edx
jmp .BRLOOP2
.BLOOP2:
bsf ecx, edx ; r = __builtin_ctzl(index);
jz near .ELOOP2
lea esi, [esi+ecx*2] ; k += r;
shr edx, cl ; index >>= r;
- mov DWORD [esp+temp3], edx
+ mov dword [esp+temp3], edx
.BRLOOP2:
cmp ecx, 16 ; while (r > 15) {
jl near .ERLOOP2
sub ecx, 16 ; r -= 16;
- mov DWORD [esp+temp], ecx
+ mov dword [esp+temp], ecx
mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, DWORD [esp+temp]
+ mov ecx, dword [esp+temp]
jmp .BRLOOP2
.ERLOOP2:
movsx eax, word [esi] ; temp = t1[k];
bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
inc eax
- mov DWORD [esp+temp2], eax
+ mov dword [esp+temp2], eax
; Emit Huffman symbol for run length / number of bits
shl ecx, 4 ; temp3 = (r << 4) + nbits;
add ecx, eax
@@ -384,13 +381,13 @@ EXTN(jsimd_huff_encode_one_block_sse2):
movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
; Mask off any extra bits in code
- mov ecx, DWORD [esp+temp2]
+ mov ecx, dword [esp+temp2]
mov eax, 1
shl eax, cl
dec eax
and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, DWORD [esp+temp3]
+ mov edx, dword [esp+temp3]
add esi, 2 ; ++k;
shr edx, 1 ; index >>= 1;
@@ -407,8 +404,8 @@ EXTN(jsimd_huff_encode_one_block_sse2):
mov eax, [esp+buffer]
pop esi
; Save put_buffer & put_bits
- mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits;
+ mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
+ mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits;
pop ebp
pop edi
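
Two things in this file's hunks are worth unpacking. First, jpeg_nbits_table is now referenced as an external symbol (EXTERN) instead of being assembled inline from an %include, as the first hunk shows. Second, the "well-known technique for obtaining the absolute value" mentioned in the comments computes both abs(temp) and the complement representation JPEG emits for negative coefficients, without branches. A scalar C sketch of what the xor/sub/add sequence does (helper name is illustrative):

  #include <stdint.h>

  /* Branchless magnitude coding, as described in the comments above.
   * On a two's complement machine, temp >> 31 is 0 for non-negative
   * inputs and -1 for negative ones. */
  static void dc_magnitude(int32_t temp, int32_t *abs_out, int32_t *emit_out)
  {
    int32_t temp3 = temp >> 31;        /* sign mask: 0 or -1 */
    int32_t temp2 = temp + temp3;      /* temp, or ~abs(temp) if negative */
    temp = (temp ^ temp3) - temp3;     /* abs(temp) */
    *abs_out = temp;    /* indexes jpeg_nbits_table to find nbits */
    *emit_out = temp2;  /* low nbits of this are the emitted value */
  }
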
diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm
index e35a7d8..8b73178 100644
--- a/simd/i386/jcphuff-sse2.asm
+++ b/simd/i386/jcphuff-sse2.asm
@@ -15,8 +15,6 @@
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcsample-avx2.asm b/simd/i386/jcsample-avx2.asm
index 5bcdefd..0a20802 100644
--- a/simd/i386/jcsample-avx2.asm
+++ b/simd/i386/jcsample-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcsample-mmx.asm b/simd/i386/jcsample-mmx.asm
index faf4234..2c223ee 100644
--- a/simd/i386/jcsample-mmx.asm
+++ b/simd/i386/jcsample-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jcsample-sse2.asm b/simd/i386/jcsample-sse2.asm
index b10fa83..4fea60d 100644
--- a/simd/i386/jcsample-sse2.asm
+++ b/simd/i386/jcsample-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm
index 46de9b9..015be04 100644
--- a/simd/i386/jdcolext-avx2.asm
+++ b/simd/i386/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -348,7 +346,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
vmovd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi], ax
+ mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
@@ -357,7 +355,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
; space.
test ecx, ecx
jz short .nextrow
- mov BYTE [edi], al
+ mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm
index cd2cb3f..5813cfc 100644
--- a/simd/i386/jdcolext-mmx.asm
+++ b/simd/i386/jdcolext-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -280,7 +278,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
movd eax, mmA
cmp ecx, byte SIZEOF_DWORD
jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
+ mov dword [edi+0*SIZEOF_DWORD], eax
psrlq mmA, DWORD_BIT
movd eax, mmA
sub ecx, byte SIZEOF_DWORD
@@ -288,14 +286,14 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
.column_st2:
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
+ mov word [edi+0*SIZEOF_WORD], ax
shr eax, WORD_BIT
sub ecx, byte SIZEOF_WORD
add edi, byte SIZEOF_WORD
.column_st1:
cmp ecx, byte SIZEOF_BYTE
jb short .nextrow
- mov BYTE [edi+0*SIZEOF_BYTE], al
+ mov byte [edi+0*SIZEOF_BYTE], al
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -367,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx):
.column_st4:
cmp ecx, byte SIZEOF_MMWORD/8
jb short .nextrow
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
+ movd dword [edi+0*SIZEOF_DWORD], mmA
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm
index 0fcb006..d5572b3 100644
--- a/simd/i386/jdcolext-sse2.asm
+++ b/simd/i386/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -320,7 +318,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi], ax
+ mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
@@ -329,7 +327,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; space.
test ecx, ecx
jz short .nextrow
- mov BYTE [edi], al
+ mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/i386/jdcolor-avx2.asm b/simd/i386/jdcolor-avx2.asm
index d2f86e6..e05b60d 100644
--- a/simd/i386/jdcolor-avx2.asm
+++ b/simd/i386/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdcolor-mmx.asm b/simd/i386/jdcolor-mmx.asm
index 8f5a3b3..fb7e7bc 100644
--- a/simd/i386/jdcolor-mmx.asm
+++ b/simd/i386/jdcolor-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdcolor-sse2.asm b/simd/i386/jdcolor-sse2.asm
index ae553db..b736255 100644
--- a/simd/i386/jdcolor-sse2.asm
+++ b/simd/i386/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdmerge-avx2.asm b/simd/i386/jdmerge-avx2.asm
index 1731844..711e679 100644
--- a/simd/i386/jdmerge-avx2.asm
+++ b/simd/i386/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdmerge-mmx.asm b/simd/i386/jdmerge-mmx.asm
index 607bf39..6e8311d 100644
--- a/simd/i386/jdmerge-mmx.asm
+++ b/simd/i386/jdmerge-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdmerge-sse2.asm b/simd/i386/jdmerge-sse2.asm
index ddb1d5e..e32f90a 100644
--- a/simd/i386/jdmerge-sse2.asm
+++ b/simd/i386/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm
index cde4865..e35f728 100644
--- a/simd/i386/jdmrgext-avx2.asm
+++ b/simd/i386/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -354,7 +352,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
vmovd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi], ax
+ mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
@@ -363,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
; space.
test ecx, ecx
jz short .endcolumn
- mov BYTE [edi], al
+ mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm
index 4b9e35d..eb3e36b 100644
--- a/simd/i386/jdmrgext-mmx.asm
+++ b/simd/i386/jdmrgext-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -283,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
movd eax, mmA
cmp ecx, byte SIZEOF_DWORD
jb short .column_st2
- mov DWORD [edi+0*SIZEOF_DWORD], eax
+ mov dword [edi+0*SIZEOF_DWORD], eax
psrlq mmA, DWORD_BIT
movd eax, mmA
sub ecx, byte SIZEOF_DWORD
@@ -291,14 +289,14 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
.column_st2:
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi+0*SIZEOF_WORD], ax
+ mov word [edi+0*SIZEOF_WORD], ax
shr eax, WORD_BIT
sub ecx, byte SIZEOF_WORD
add edi, byte SIZEOF_WORD
.column_st1:
cmp ecx, byte SIZEOF_BYTE
jb short .endcolumn
- mov BYTE [edi+0*SIZEOF_BYTE], al
+ mov byte [edi+0*SIZEOF_BYTE], al
%else ; RGB_PIXELSIZE == 4 ; -----------
@@ -373,7 +371,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx):
.column_st4:
cmp ecx, byte SIZEOF_MMWORD/8
jb short .endcolumn
- movd DWORD [edi+0*SIZEOF_DWORD], mmA
+ movd dword [edi+0*SIZEOF_DWORD], mmA
%endif ; RGB_PIXELSIZE ; ---------------
diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm
index ac4697e..c113dc4 100644
--- a/simd/i386/jdmrgext-sse2.asm
+++ b/simd/i386/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -325,7 +323,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movd eax, xmmA
cmp ecx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [edi], ax
+ mov word [edi], ax
add edi, byte SIZEOF_WORD
sub ecx, byte SIZEOF_WORD
shr eax, 16
@@ -334,7 +332,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; space.
test ecx, ecx
jz short .endcolumn
- mov BYTE [edi], al
+ mov byte [edi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/i386/jdsample-avx2.asm b/simd/i386/jdsample-avx2.asm
index 61ce511..a800c35 100644
--- a/simd/i386/jdsample-avx2.asm
+++ b/simd/i386/jdsample-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdsample-mmx.asm b/simd/i386/jdsample-mmx.asm
index 1f810fa..12c49f0 100644
--- a/simd/i386/jdsample-mmx.asm
+++ b/simd/i386/jdsample-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jdsample-sse2.asm b/simd/i386/jdsample-sse2.asm
index f0da626..4e28d2f 100644
--- a/simd/i386/jdsample-sse2.asm
+++ b/simd/i386/jdsample-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/i386/jfdctflt-3dn.asm b/simd/i386/jfdctflt-3dn.asm
index 1d45865..322ab16 100644
--- a/simd/i386/jfdctflt-3dn.asm
+++ b/simd/i386/jfdctflt-3dn.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctflt-sse.asm b/simd/i386/jfdctflt-sse.asm
index 1faf835..86952c6 100644
--- a/simd/i386/jfdctflt-sse.asm
+++ b/simd/i386/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctfst-mmx.asm b/simd/i386/jfdctfst-mmx.asm
index 0271901..80645a5 100644
--- a/simd/i386/jfdctfst-mmx.asm
+++ b/simd/i386/jfdctfst-mmx.asm
@@ -18,8 +18,6 @@
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctfst-sse2.asm b/simd/i386/jfdctfst-sse2.asm
index f09dadd..446fa7a 100644
--- a/simd/i386/jfdctfst-sse2.asm
+++ b/simd/i386/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
index ae258ee..97de230 100644
--- a/simd/i386/jfdctint-avx2.asm
+++ b/simd/i386/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
index c6bd959..3ade9d4 100644
--- a/simd/i386/jfdctint-mmx.asm
+++ b/simd/i386/jfdctint-mmx.asm
@@ -18,8 +18,6 @@
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
index d67dcc1..71b684c 100644
--- a/simd/i386/jfdctint-sse2.asm
+++ b/simd/i386/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm
index 73aa18d..8795191 100644
--- a/simd/i386/jidctflt-3dn.asm
+++ b/simd/i386/jidctflt-3dn.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -92,23 +90,23 @@ EXTN(jsimd_idct_float_3dnow):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
pushpic ebx ; save GOT address
- mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
- or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
or eax, ebx
poppic ebx ; restore GOT address
jnz short .columnDCT
; -- AC terms all zero
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
punpcklwd mm0, mm0
psrad mm0, (DWORD_BIT-WORD_BIT)
@@ -135,10 +133,10 @@ EXTN(jsimd_idct_float_3dnow):
; -- Even part
- movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
- movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
punpcklwd mm0, mm0
punpcklwd mm1, mm1
@@ -182,10 +180,10 @@ EXTN(jsimd_idct_float_3dnow):
; -- Odd part
- movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
- movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
- movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
punpcklwd mm2, mm2
punpcklwd mm3, mm3
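
The dword loads being lower-cased at the top of this file's .columnloop form the zero-column test: the AC coefficients of a column are OR'ed together, and if the result is zero the column's IDCT collapses to a constant, so the dequantized DC value is simply replicated. A scalar sketch of that shortcut, with illustrative names:

  typedef short JCOEF;

  /* Returns nonzero when coefficients 1..7 of a column are all zero,
   * mirroring the test guarded by NO_ZERO_COLUMN_TEST_* above. */
  static int column_is_dc_only(const JCOEF *col, int stride)
  {
    for (int row = 1; row < 8; row++)
      if (col[row * stride] != 0)
        return 0;
    return 1;
  }

  static void idct_column(const JCOEF *col, int stride,
                          const float *quant, float *out)
  {
    if (column_is_dc_only(col, stride)) {
      float dc = col[0] * quant[0];    /* dequantize DC */
      for (int row = 0; row < 8; row++)
        out[row] = dc;                 /* constant column */
      return;
    }
    /* ... full 8-point IDCT for the general case ... */
  }
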
diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm
index 386650f..b27ecfd 100644
--- a/simd/i386/jidctflt-sse.asm
+++ b/simd/i386/jidctflt-sse.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm
index 9de7139..c646eae 100644
--- a/simd/i386/jidctflt-sse2.asm
+++ b/simd/i386/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse2):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm
index d3e8a5d..24622d4 100644
--- a/simd/i386/jidctfst-mmx.asm
+++ b/simd/i386/jidctfst-mmx.asm
@@ -18,8 +18,6 @@
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -123,8 +121,8 @@ EXTN(jsimd_idct_ifast_mmx):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm
index 83bc414..19704ff 100644
--- a/simd/i386/jidctfst-sse2.asm
+++ b/simd/i386/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -118,8 +116,8 @@ EXTN(jsimd_idct_ifast_sse2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
index b3b7b14..c371985 100644
--- a/simd/i386/jidctint-avx2.asm
+++ b/simd/i386/jidctint-avx2.asm
@@ -18,8 +18,6 @@
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -320,8 +318,8 @@ EXTN(jsimd_idct_islow_avx2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
index 6ca6d06..4f07f56 100644
--- a/simd/i386/jidctint-mmx.asm
+++ b/simd/i386/jidctint-mmx.asm
@@ -18,8 +18,6 @@
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -136,8 +134,8 @@ EXTN(jsimd_idct_islow_mmx):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
index a6bd00a..e442fdd 100644
--- a/simd/i386/jidctint-sse2.asm
+++ b/simd/i386/jidctint-sse2.asm
@@ -18,8 +18,6 @@
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -131,8 +129,8 @@ EXTN(jsimd_idct_islow_sse2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm
index 336ee3b..e2307e1 100644
--- a/simd/i386/jidctred-mmx.asm
+++ b/simd/i386/jidctred-mmx.asm
@@ -18,8 +18,6 @@
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see the jidctred.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -144,8 +142,8 @@ EXTN(jsimd_idct_4x4_mmx):
alignx 16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -464,16 +462,16 @@ EXTN(jsimd_idct_4x4_mmx):
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
psrlq mm1, 4*BYTE_BIT
psrlq mm0, 4*BYTE_BIT
mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
- movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
- movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
+ movd dword [edx+eax*SIZEOF_JSAMPLE], mm1
+ movd dword [esi+eax*SIZEOF_JSAMPLE], mm0
emms ; empty MMX state
@@ -688,8 +686,8 @@ EXTN(jsimd_idct_2x2_mmx):
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
emms ; empty MMX state
diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm
index 97838ba..6e56494 100644
--- a/simd/i386/jidctred-sse2.asm
+++ b/simd/i386/jidctred-sse2.asm
@@ -18,8 +18,6 @@
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see the jidctred.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -139,8 +137,8 @@ EXTN(jsimd_idct_4x4_sse2):
mov esi, JCOEFPTR [coef_block(eax)] ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
@@ -578,8 +576,8 @@ EXTN(jsimd_idct_2x2_sse2):
mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
- mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
- mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
+ mov word [edx+eax*SIZEOF_JSAMPLE], bx
+ mov word [esi+eax*SIZEOF_JSAMPLE], cx
pop edi
pop esi
diff --git a/simd/i386/jquant-3dn.asm b/simd/i386/jquant-3dn.asm
index 1767f44..5cb60ca 100644
--- a/simd/i386/jquant-3dn.asm
+++ b/simd/i386/jquant-3dn.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jquant-mmx.asm b/simd/i386/jquant-mmx.asm
index 98932db..61305c6 100644
--- a/simd/i386/jquant-mmx.asm
+++ b/simd/i386/jquant-mmx.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jquant-sse.asm b/simd/i386/jquant-sse.asm
index cc244c4..218adc9 100644
--- a/simd/i386/jquant-sse.asm
+++ b/simd/i386/jquant-sse.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jquantf-sse2.asm b/simd/i386/jquantf-sse2.asm
index 8d1201c..a881ab5 100644
--- a/simd/i386/jquantf-sse2.asm
+++ b/simd/i386/jquantf-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm
index ea8e1a1..5ed6bec 100644
--- a/simd/i386/jquanti-avx2.asm
+++ b/simd/i386/jquanti-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jquanti-sse2.asm b/simd/i386/jquanti-sse2.asm
index 2a69494..0a50940 100644
--- a/simd/i386/jquanti-sse2.asm
+++ b/simd/i386/jquanti-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
index 563949a..2f92f8c 100644
--- a/simd/i386/jsimd.c
+++ b/simd/i386/jsimd.c
@@ -543,6 +543,12 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -579,6 +585,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr);
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
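
The new jsimd_can_h1v2_fancy_upsample() stub returns 0 because i386 has no h1v2 SIMD fast path; the empty worker exists only to satisfy the linker. Callers probe the can-function at setup, so the stub body is never reached and the portable C routine runs instead. A self-contained sketch of that dispatch convention (simplified, with hypothetical function-pointer plumbing, not the actual jdsample.c logic):

  #include <stdio.h>

  typedef void (*upsample_fn)(void);

  static void h1v2_fancy_upsample_c(void)     { puts("portable C path"); }
  static void jsimd_h1v2_fancy_upsample(void) { /* empty stub, never run */ }
  static int  jsimd_can_h1v2_fancy_upsample(void) { return 0; }

  int main(void)
  {
    /* A zero return routes the operation to the plain C routine. */
    upsample_fn fn = jsimd_can_h1v2_fancy_upsample() ?
                     jsimd_h1v2_fancy_upsample : h1v2_fancy_upsample_c;
    fn();   /* prints "portable C path" */
    return 0;
  }
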
diff --git a/simd/i386/jsimdcpu.asm b/simd/i386/jsimdcpu.asm
index 0af4eec..ddcafa9 100644
--- a/simd/i386/jsimdcpu.asm
+++ b/simd/i386/jsimdcpu.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/jsimd.h b/simd/jsimd.h
index a9fc812..99c8801 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -121,13 +121,6 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_neon
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
- (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows);
-
EXTERN(void) jsimd_rgb_ycc_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
@@ -263,6 +256,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_avx2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
EXTERN(void) jsimd_rgb_gray_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
@@ -401,13 +416,6 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
- (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
- JSAMPARRAY output_buf, int num_rows);
-
EXTERN(void) jsimd_ycc_rgb_convert_dspr2
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
@@ -562,6 +570,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+
EXTERN(void) jsimd_h2v1_upsample_dspr2
(int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
@@ -608,6 +623,12 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
EXTERN(void) jsimd_h2v1_fancy_upsample_neon
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
@@ -762,6 +783,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf, JSAMPLE *range);
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
deleted file mode 100644
index 6cdeb5e..0000000
--- a/simd/loongson/jccolext-mmi.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * ZhangLixia <zhanglixia-hf@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jccolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA mm0
-#define mmB mm1
-#elif RGB_GREEN == 0
-#define mmA mm2
-#define mmB mm3
-#elif RGB_BLUE == 0
-#define mmA mm4
-#define mmB mm5
-#else
-#define mmA mm6
-#define mmB mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC mm0
-#define mmD mm1
-#elif RGB_GREEN == 1
-#define mmC mm2
-#define mmD mm3
-#elif RGB_BLUE == 1
-#define mmC mm4
-#define mmD mm5
-#else
-#define mmC mm6
-#define mmD mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE mm0
-#define mmF mm1
-#elif RGB_GREEN == 2
-#define mmE mm2
-#define mmF mm3
-#elif RGB_BLUE == 2
-#define mmE mm4
-#define mmF mm5
-#else
-#define mmE mm6
-#define mmF mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG mm0
-#define mmH mm1
-#elif RGB_GREEN == 3
-#define mmG mm2
-#define mmH mm3
-#elif RGB_BLUE == 3
-#define mmG mm4
-#define mmH mm5
-#else
-#define mmG mm6
-#define mmH mm7
-#endif
-
-
-void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- JSAMPROW inptr, outptr0, outptr1, outptr2;
- int num_cols, col;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 wk[7];
- __m64 Y_BG, Cb_RG, Cr_BG;
-
- while (--num_rows >= 0) {
- inptr = *input_buf++;
- outptr0 = output_buf[0][output_row];
- outptr1 = output_buf[1][output_row];
- outptr2 = output_buf[2][output_row];
- output_row++;
-
- for (num_cols = image_width; num_cols > 0; num_cols -= 8,
- outptr0 += 8, outptr1 += 8, outptr2 += 8) {
-
-#if RGB_PIXELSIZE == 3
-
- if (num_cols < 8) {
- col = num_cols * 3;
- asm(".set noreorder\r\n"
-
- "li $8, 1\r\n"
- "move $9, %3\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 1f\r\n"
- "nop \r\n"
- "subu $9, $9, 1\r\n"
- "xor $12, $12, $12\r\n"
- "move $13, %5\r\n"
- "dadd $13, $13, $9\r\n"
- "lbu $12, 0($13)\r\n"
-
- "1: \r\n"
- "li $8, 2\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 2f\r\n"
- "nop \r\n"
- "subu $9, $9, 2\r\n"
- "xor $11, $11, $11\r\n"
- "move $13, %5\r\n"
- "dadd $13, $13, $9\r\n"
- "lhu $11, 0($13)\r\n"
- "sll $12, $12, 16\r\n"
- "or $12, $12, $11\r\n"
-
- "2: \r\n"
- "dmtc1 $12, %0\r\n"
- "li $8, 4\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 3f\r\n"
- "nop \r\n"
- "subu $9, $9, 4\r\n"
- "move $13, %5\r\n"
- "dadd $13, $13, $9\r\n"
- "lwu $14, 0($13)\r\n"
- "dmtc1 $14, %1\r\n"
- "dsll32 $12, $12, 0\r\n"
- "or $12, $12, $14\r\n"
- "dmtc1 $12, %0\r\n"
-
- "3: \r\n"
- "li $8, 8\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 4f\r\n"
- "nop \r\n"
- "mov.s %1, %0\r\n"
- "ldc1 %0, 0(%5)\r\n"
- "li $9, 8\r\n"
- "j 5f\r\n"
- "nop \r\n"
-
- "4: \r\n"
- "li $8, 16\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 5f\r\n"
- "nop \r\n"
- "mov.s %2, %0\r\n"
- "ldc1 %0, 0(%5)\r\n"
- "ldc1 %1, 8(%5)\r\n"
-
- "5: \r\n"
- "nop \r\n"
- ".set reorder\r\n"
-
- : "=f" (mmA), "=f" (mmG), "=f" (mmF)
- : "r" (col), "r" (num_rows), "r" (inptr)
- : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
- "$14", "memory"
- );
- } else {
- if (!(((long)inptr) & 7)) {
- mmA = _mm_load_si64((__m64 *)&inptr[0]);
- mmG = _mm_load_si64((__m64 *)&inptr[8]);
- mmF = _mm_load_si64((__m64 *)&inptr[16]);
- } else {
- mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
- mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
- mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
- }
- inptr += RGB_PIXELSIZE * 8;
- }
- mmD = mmA;
- mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
- mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
-
- mmA = _mm_unpackhi_pi8(mmA, mmG);
- mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
-
- mmD = _mm_unpacklo_pi8(mmD, mmF);
- mmG = _mm_unpackhi_pi8(mmG, mmF);
-
- mmE = mmA;
- mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
- mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
-
- mmA = _mm_unpackhi_pi8(mmA, mmD);
- mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
-
- mmE = _mm_unpacklo_pi8(mmE, mmG);
- mmD = _mm_unpackhi_pi8(mmD, mmG);
- mmC = mmA;
- mmA = _mm_loadlo_pi8_f(mmA);
- mmC = _mm_loadhi_pi8_f(mmC);
-
- mmB = mmE;
- mmE = _mm_loadlo_pi8_f(mmE);
- mmB = _mm_loadhi_pi8_f(mmB);
-
- mmF = mmD;
- mmD = _mm_loadlo_pi8_f(mmD);
- mmF = _mm_loadhi_pi8_f(mmF);
-
-#else /* RGB_PIXELSIZE == 4 */
-
- if (num_cols < 8) {
- col = num_cols;
- asm(".set noreorder\r\n"
-
- "li $8, 1\r\n"
- "move $9, %4\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 1f\r\n"
- "nop \r\n"
- "subu $9, $9, 1\r\n"
- "dsll $11, $9, 2\r\n"
- "move $13, %5\r\n"
- "daddu $13, $13, $11\r\n"
- "lwc1 %0, 0($13)\r\n"
-
- "1: \r\n"
- "li $8, 2\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 2f\r\n"
- "nop \r\n"
- "subu $9, $9, 2\r\n"
- "dsll $11, $9, 2\r\n"
- "move $13, %5\r\n"
- "daddu $13, $13, $11\r\n"
- "mov.s %1, %0\r\n"
- "ldc1 %0, 0($13)\r\n"
-
- "2: \r\n"
- "li $8, 4\r\n"
- "and $10, $9, $8\r\n"
- "beqz $10, 3f\r\n"
- "nop \r\n"
- "mov.s %2, %0\r\n"
- "mov.s %3, %1\r\n"
- "ldc1 %0, 0(%5)\r\n"
- "ldc1 %1, 8(%5)\r\n"
-
- "3: \r\n"
- "nop \r\n"
- ".set reorder\r\n"
-
- : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
- : "r" (col), "r" (inptr)
- : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
- );
- } else {
- if (!(((long)inptr) & 7)) {
- mmA = _mm_load_si64((__m64 *)&inptr[0]);
- mmF = _mm_load_si64((__m64 *)&inptr[8]);
- mmD = _mm_load_si64((__m64 *)&inptr[16]);
- mmC = _mm_load_si64((__m64 *)&inptr[24]);
- } else {
- mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
- mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
- mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
- mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
- }
- inptr += RGB_PIXELSIZE * 8;
- }
- mmB = mmA;
- mmA = _mm_unpacklo_pi8(mmA, mmF);
- mmB = _mm_unpackhi_pi8(mmB, mmF);
-
- mmG = mmD;
- mmD = _mm_unpacklo_pi8(mmD, mmC);
- mmG = _mm_unpackhi_pi8(mmG, mmC);
-
- mmE = mmA;
- mmA = _mm_unpacklo_pi16(mmA, mmD);
- mmE = _mm_unpackhi_pi16(mmE, mmD);
-
- mmH = mmB;
- mmB = _mm_unpacklo_pi16(mmB, mmG);
- mmH = _mm_unpackhi_pi16(mmH, mmG);
-
- mmC = mmA;
- mmA = _mm_loadlo_pi8_f(mmA);
- mmC = _mm_loadhi_pi8_f(mmC);
-
- mmD = mmB;
- mmB = _mm_loadlo_pi8_f(mmB);
- mmD = _mm_loadhi_pi8_f(mmD);
-
- mmG = mmE;
- mmE = _mm_loadlo_pi8_f(mmE);
- mmG = _mm_loadhi_pi8_f(mmG);
-
- mmF = mmH;
- mmF = _mm_unpacklo_pi8(mmF, mmH);
- mmH = _mm_unpackhi_pi8(mmH, mmH);
- mmF = _mm_srli_pi16(mmF, BYTE_BIT);
- mmH = _mm_srli_pi16(mmH, BYTE_BIT);
-
-#endif
-
- wk[0] = mm0;
- wk[1] = mm1;
- wk[2] = mm4;
- wk[3] = mm5;
-
- mm6 = mm1;
- mm1 = _mm_unpacklo_pi16(mm1, mm3);
- mm6 = _mm_unpackhi_pi16(mm6, mm3);
- mm7 = mm1;
- mm4 = mm6;
- mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
- mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
- mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
- mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
- wk[4] = mm1;
- wk[5] = mm6;
-
- mm1 = _mm_loadlo_pi16_f(mm5);
- mm6 = _mm_loadhi_pi16_f(mm5);
- mm1 = _mm_srli_pi32(mm1, 1);
- mm6 = _mm_srli_pi32(mm6, 1);
-
- mm5 = PD_ONEHALFM1_CJ;
- mm7 = _mm_add_pi32(mm7, mm1);
- mm4 = _mm_add_pi32(mm4, mm6);
- mm7 = _mm_add_pi32(mm7, mm5);
- mm4 = _mm_add_pi32(mm4, mm5);
- mm7 = _mm_srli_pi32(mm7, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm7 = _mm_packs_pi32(mm7, mm4);
-
- mm1 = wk[2];
- mm6 = mm0;
- mm0 = _mm_unpacklo_pi16(mm0, mm2);
- mm6 = _mm_unpackhi_pi16(mm6, mm2);
- mm5 = mm0;
- mm4 = mm6;
- mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
- mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
- mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
- mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
- wk[6] = mm0;
- wk[7] = mm6;
- mm0 = _mm_loadlo_pi16_f(mm1);
- mm6 = _mm_loadhi_pi16_f(mm1);
- mm0 = _mm_srli_pi32(mm0, 1);
- mm6 = _mm_srli_pi32(mm6, 1);
-
- mm1 = PD_ONEHALFM1_CJ;
- mm5 = _mm_add_pi32(mm5, mm0);
- mm4 = _mm_add_pi32(mm4, mm6);
- mm5 = _mm_add_pi32(mm5, mm1);
- mm4 = _mm_add_pi32(mm4, mm1);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm5 = _mm_packs_pi32(mm5, mm4);
-
- mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
- mm5 = _mm_or_si64(mm5, mm7);
- Cb_RG = mm5;
-
- mm0 = wk[3];
- mm6 = wk[2];
- mm1 = wk[1];
-
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi16(mm0, mm3);
- mm4 = _mm_unpackhi_pi16(mm4, mm3);
- mm7 = mm0;
- mm5 = mm4;
- mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
- mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
- mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
- mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
- mm3 = PD_ONEHALF;
- mm0 = _mm_add_pi32(mm0, wk[4]);
- mm4 = _mm_add_pi32(mm4, wk[5]);
- mm0 = _mm_add_pi32(mm0, mm3);
- mm4 = _mm_add_pi32(mm4, mm3);
- mm0 = _mm_srli_pi32(mm0, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm0 = _mm_packs_pi32(mm0, mm4);
-
- mm3 = _mm_loadlo_pi16_f(mm1);
- mm4 = _mm_loadhi_pi16_f(mm1);
- mm3 = _mm_srli_pi32(mm3, 1);
- mm4 = _mm_srli_pi32(mm4, 1);
-
- mm1 = PD_ONEHALFM1_CJ;
- mm7 = _mm_add_pi32(mm7, mm3);
- mm5 = _mm_add_pi32(mm5, mm4);
- mm7 = _mm_add_pi32(mm7, mm1);
- mm5 = _mm_add_pi32(mm5, mm1);
- mm7 = _mm_srli_pi32(mm7, SCALEBITS);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm7 = _mm_packs_pi32(mm7, mm5);
-
- mm3 = wk[0];
- mm4 = mm6;
- mm6 = _mm_unpacklo_pi16(mm6, mm2);
- mm4 = _mm_unpackhi_pi16(mm4, mm2);
- mm1 = mm6;
- mm5 = mm4;
- mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
- mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
- mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
- mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
- mm2 = PD_ONEHALF;
- mm6 = _mm_add_pi32(mm6, wk[6]);
- mm4 = _mm_add_pi32(mm4, wk[7]);
- mm6 = _mm_add_pi32(mm6, mm2);
- mm4 = _mm_add_pi32(mm4, mm2);
- mm6 = _mm_srli_pi32(mm6, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm6 = _mm_packs_pi32(mm6, mm4);
-
- mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
- mm6 = _mm_or_si64(mm6, mm0);
- Y_BG = mm6;
-
- mm2 = _mm_loadlo_pi16_f(mm3);
- mm4 = _mm_loadhi_pi16_f(mm3);
- mm2 = _mm_srli_pi32(mm2, 1);
- mm4 = _mm_srli_pi32(mm4, 1);
-
- mm0 = PD_ONEHALFM1_CJ;
- mm1 = _mm_add_pi32(mm1, mm2);
- mm5 = _mm_add_pi32(mm5, mm4);
- mm1 = _mm_add_pi32(mm1, mm0);
- mm5 = _mm_add_pi32(mm5, mm0);
- mm1 = _mm_srli_pi32(mm1, SCALEBITS);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm1 = _mm_packs_pi32(mm1, mm5);
-
- mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
- mm1 = _mm_or_si64(mm1, mm7);
- Cr_BG = mm1;
-
- _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
- _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
- _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
- }
- }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
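
The tail above finishes the RGB->YCC pass: each Cb/Cr output combines an _mm_madd_pi16 pair with half of one channel (a widened lane shifted right once) and the PD_ONEHALFM1_CJ rounding constant before the final descale. For reference, a minimal scalar sketch of the same Cb arithmetic, assuming SCALEBITS is 16 (which the 2^16-scaled constants in jccolor-mmi.c below imply); FIX and cb_scalar are illustrative names, not part of the deleted file:

#include <stdint.h>
#include <stdio.h>

#define SCALEBITS 16
#define CENTERJSAMPLE 128
#define FIX(x) ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

/* Cb = -0.16874*R - 0.33126*G + 0.5*B + 128: the madd pair supplies the
 * first two products, 0.5*B comes from widening B and shifting right once,
 * and the final constant is the rounding term plus the +128 recenter
 * (PD_ONEHALFM1_CJ in the deleted code). */
static uint8_t cb_scalar(int r, int g, int b)
{
  int32_t acc = -FIX(0.16874) * r - FIX(0.33126) * g;
  acc += ((int32_t)b << SCALEBITS) >> 1;
  acc += (1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS);
  return (uint8_t)(acc >> SCALEBITS);
}

int main(void)
{
  printf("Cb(red)=%d Cb(blue)=%d\n", cb_scalar(255, 0, 0),
         cb_scalar(0, 0, 255));   /* prints 85 and 255 */
  return 0;
}
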
diff --git a/simd/loongson/jccolor-mmi.c b/simd/loongson/jccolor-mmi.c
deleted file mode 100644
index 93ef5c7..0000000
--- a/simd/loongson/jccolor-mmi.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> YCC CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define F_0_081 ((short)5329) /* FIX(0.08131) */
-#define F_0_114 ((short)7471) /* FIX(0.11400) */
-#define F_0_168 ((short)11059) /* FIX(0.16874) */
-#define F_0_250 ((short)16384) /* FIX(0.25000) */
-#define F_0_299 ((short)19595) /* FIX(0.29900) */
-#define F_0_331 ((short)21709) /* FIX(0.33126) */
-#define F_0_418 ((short)27439) /* FIX(0.41869) */
-#define F_0_587 ((short)38470) /* FIX(0.58700) */
-#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
-
-enum const_index {
- index_PD_ONEHALF,
- index_PW_F0299_F0337,
- index_PW_F0114_F0250,
- index_PW_MF016_MF033,
- index_PW_MF008_MF041,
- index_PD_ONEHALFM1_CJ
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
- _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
- _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
- _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
- _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
- _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
- ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
-};
-
-#define get_const_value(index) (*(__m64 *)&const_value[index])
-
-#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
-#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337)
-#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250)
-#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033)
-#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041)
-#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ)
-
-
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
-
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
-
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
-
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
-
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
-
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi
-#include "jccolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_rgb_ycc_convert_mmi
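
jccolor-mmi.c itself contains no pixel loop: it defines the 2^16-scaled constants and then instantiates the jccolext-mmi.c template seven times, rebinding the RGB_* offsets and the exported function name before each #include. A self-contained sketch of that specialization idiom, collapsed into a function-defining macro for brevity; DEFINE_LUMA, luma_rgb, and luma_bgr are illustrative names, and the weights are F_0_299/F_0_587/F_0_114 from above:

#include <stdio.h>

/* The template body (jccolext-mmi.c's role) is reduced here to a luma dot
 * product; each "instantiation" rebinds the channel offsets and the
 * function name, mirroring the #define/#include/#undef sequence above. */
#define DEFINE_LUMA(name, R, G, B) \
  static int name(const unsigned char *p) \
  { return (19595 * p[R] + 38470 * p[G] + 7471 * p[B]) >> 16; }

DEFINE_LUMA(luma_rgb, 0, 1, 2)   /* RGB layout: red first */
DEFINE_LUMA(luma_bgr, 2, 1, 0)   /* EXT_BGR layout: blue first */

int main(void)
{
  unsigned char rgb[3] = { 255, 0, 0 }, bgr[3] = { 0, 0, 255 };
  printf("%d %d\n", luma_rgb(rgb), luma_bgr(bgr));   /* both print 76 */
  return 0;
}

Each instantiation compiles to an independent function, which is why the file #undefs every macro between includes.
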
diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c
deleted file mode 100644
index 2f2d851..0000000
--- a/simd/loongson/jcsample-mmi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA DOWNSAMPLING */
-
-#include "jsimd_mmi.h"
-#include "jcsample.h"
-
-
-void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_in_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- int inrow, outrow, outcol, bias;
- JDIMENSION output_cols = width_in_blocks * DCTSIZE;
- JSAMPROW inptr0, inptr1, outptr;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
-
- expand_right_edge(input_data, max_v_samp_factor, image_width,
- output_cols * 2);
-
- bias = (1 << 17) + 1; /* 0x00020001 (bias pattern) */
- mm7 = _mm_set1_pi32(bias); /* mm7={1, 2, 1, 2} */
- mm6 = _mm_cmpeq_pi16(mm6, mm6);
- mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */
-
- for (inrow = 0, outrow = 0; outrow < v_samp_factor;
- inrow += 2, outrow++) {
-
- inptr0 = input_data[inrow];
- inptr1 = input_data[inrow + 1];
- outptr = output_data[outrow];
-
- for (outcol = output_cols; outcol > 0;
- outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
-
- mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
- mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
- mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
- mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
-
- mm4 = mm0;
- mm5 = mm1;
- mm0 = _mm_and_si64(mm0, mm6);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm1 = _mm_and_si64(mm1, mm6);
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
- mm0 = _mm_add_pi16(mm0, mm4);
- mm1 = _mm_add_pi16(mm1, mm5);
-
- mm4 = mm2;
- mm5 = mm3;
- mm2 = _mm_and_si64(mm2, mm6);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm3 = _mm_and_si64(mm3, mm6);
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
- mm2 = _mm_add_pi16(mm2, mm4);
- mm3 = _mm_add_pi16(mm3, mm5);
-
- mm0 = _mm_add_pi16(mm0, mm1);
- mm2 = _mm_add_pi16(mm2, mm3);
- mm0 = _mm_add_pi16(mm0, mm7);
- mm2 = _mm_add_pi16(mm2, mm7);
- mm0 = _mm_srli_pi16(mm0, 2);
- mm2 = _mm_srli_pi16(mm2, 2);
-
- mm0 = _mm_packs_pu16(mm0, mm2);
-
- _mm_store_si64((__m64 *)&outptr[0], mm0);
- }
- }
-}
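
In the loop above, each 16-bit lane accumulates one 2x2 block of samples, and the 0x00020001 bias alternates 1,2 across output columns before the final >>2 so truncation does not drift in one direction. A scalar model of a single output sample (down2x2 is an illustrative name, not from the deleted file):

#include <stdio.h>

/* One output sample of the h2v2 downsampler: sum a 2x2 block, add the
 * alternating 1,2 bias (the 0x00020001 pattern, lane 0 = 1), shift by 2. */
static unsigned char down2x2(const unsigned char *row0,
                             const unsigned char *row1, int outcol)
{
  int bias = (outcol & 1) ? 2 : 1;
  int sum = row0[2 * outcol] + row0[2 * outcol + 1] +
            row1[2 * outcol] + row1[2 * outcol + 1] + bias;
  return (unsigned char)(sum >> 2);
}

int main(void)
{
  unsigned char r0[4] = { 10, 11, 200, 201 };
  unsigned char r1[4] = { 12, 13, 202, 203 };
  printf("%d %d\n", down2x2(r0, r1, 0), down2x2(r0, r1, 1));  /* 11 202 */
  return 0;
}
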
diff --git a/simd/loongson/jcsample.h b/simd/loongson/jcsample.h
deleted file mode 100644
index 2ac4816..0000000
--- a/simd/loongson/jcsample.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * jcsample.h
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- */
-
-LOCAL(void)
-expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
- JDIMENSION output_cols)
-{
- register JSAMPROW ptr;
- register JSAMPLE pixval;
- register int count;
- int row;
- int numcols = (int)(output_cols - input_cols);
-
- if (numcols > 0) {
- for (row = 0; row < num_rows; row++) {
- ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
- for (count = numcols; count > 0; count--)
- *ptr++ = pixval;
- }
- }
-}
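
expand_right_edge() replicates the rightmost sample out to the SIMD loop width, which is what lets the downsampler above read 16 bytes per iteration without bounds checks. A standalone model, with libjpeg's JSAMPLE flattened to unsigned char for illustration:

#include <stdio.h>

/* Standalone model of expand_right_edge(): pad each row from input_cols
 * to output_cols by replicating the rightmost valid sample. */
static void expand_right_edge_model(unsigned char **image_data, int num_rows,
                                    int input_cols, int output_cols)
{
  for (int row = 0; row < num_rows; row++) {
    unsigned char *ptr = image_data[row] + input_cols;
    unsigned char pixval = ptr[-1];
    for (int count = output_cols - input_cols; count > 0; count--)
      *ptr++ = pixval;
  }
}

int main(void)
{
  unsigned char row[8] = { 10, 20, 30, 40, 50 };   /* 5 valid samples */
  unsigned char *rows[1] = { row };
  expand_right_edge_model(rows, 1, 5, 8);
  for (int i = 0; i < 8; i++)
    printf("%d ", row[i]);                          /* ... 50 50 50 */
  printf("\n");
  return 0;
}
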
diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c
deleted file mode 100644
index 560d9b0..0000000
--- a/simd/loongson/jdcolext-mmi.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdcolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA mm0
-#define mmB mm1
-#elif RGB_GREEN == 0
-#define mmA mm2
-#define mmB mm3
-#elif RGB_BLUE == 0
-#define mmA mm4
-#define mmB mm5
-#else
-#define mmA mm6
-#define mmB mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC mm0
-#define mmD mm1
-#elif RGB_GREEN == 1
-#define mmC mm2
-#define mmD mm3
-#elif RGB_BLUE == 1
-#define mmC mm4
-#define mmD mm5
-#else
-#define mmC mm6
-#define mmD mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE mm0
-#define mmF mm1
-#elif RGB_GREEN == 2
-#define mmE mm2
-#define mmF mm3
-#elif RGB_BLUE == 2
-#define mmE mm4
-#define mmF mm5
-#else
-#define mmE mm6
-#define mmF mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG mm0
-#define mmH mm1
-#elif RGB_GREEN == 3
-#define mmG mm2
-#define mmH mm3
-#elif RGB_BLUE == 3
-#define mmG mm4
-#define mmH mm5
-#else
-#define mmG mm6
-#define mmH mm7
-#endif
-
-
-void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
- JSAMPROW outptr, inptr0, inptr1, inptr2;
- int num_cols, col;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 mm8, wk[2];
-
- while (--num_rows >= 0) {
- inptr0 = input_buf[0][input_row];
- inptr1 = input_buf[1][input_row];
- inptr2 = input_buf[2][input_row];
- input_row++;
- outptr = *output_buf++;
-
- for (num_cols = out_width; num_cols > 0; num_cols -= 8,
- inptr0 += 8, inptr1 += 8, inptr2 += 8) {
-
- mm5 = _mm_load_si64((__m64 *)inptr1);
- mm1 = _mm_load_si64((__m64 *)inptr2);
- mm8 = _mm_load_si64((__m64 *)inptr0);
- mm4 = 0;
- mm7 = 0;
- mm4 = _mm_cmpeq_pi16(mm4, mm4);
- mm7 = _mm_cmpeq_pi16(mm7, mm7);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm7 = _mm_slli_pi16(mm7, 7); /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
- mm0 = mm4; /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
-
- mm4 = _mm_and_si64(mm4, mm5); /* mm4=Cb(0246)=CbE */
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Cb(1357)=CbO */
- mm0 = _mm_and_si64(mm0, mm1); /* mm0=Cr(0246)=CrE */
- mm1 = _mm_srli_pi16(mm1, BYTE_BIT); /* mm1=Cr(1357)=CrO */
- mm4 = _mm_add_pi16(mm4, mm7);
- mm5 = _mm_add_pi16(mm5, mm7);
- mm0 = _mm_add_pi16(mm0, mm7);
- mm1 = _mm_add_pi16(mm1, mm7);
-
- /* (Original)
- * R = Y + 1.40200 * Cr
- * G = Y - 0.34414 * Cb - 0.71414 * Cr
- * B = Y + 1.77200 * Cb
- *
- * (This implementation)
- * R = Y + 0.40200 * Cr + Cr
- * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- * B = Y - 0.22800 * Cb + Cb + Cb
- */
-
- mm2 = mm4; /* mm2 = CbE */
- mm3 = mm5; /* mm3 = CbO */
- mm4 = _mm_add_pi16(mm4, mm4); /* mm4 = 2*CbE */
- mm5 = _mm_add_pi16(mm5, mm5); /* mm5 = 2*CbO */
- mm6 = mm0; /* mm6 = CrE */
- mm7 = mm1; /* mm7 = CrO */
- mm0 = _mm_add_pi16(mm0, mm0); /* mm0 = 2*CrE */
- mm1 = _mm_add_pi16(mm1, mm1); /* mm1 = 2*CrO */
-
-      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);  /* mm4=(2*CbE * -FIX(0.22800)) */
-      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);  /* mm5=(2*CbO * -FIX(0.22800)) */
- mm0 = _mm_mulhi_pi16(mm0, PW_F0402); /* mm0=(2*CrE * FIX(0.40200)) */
- mm1 = _mm_mulhi_pi16(mm1, PW_F0402); /* mm1=(2*CrO * FIX(0.40200)) */
-
- mm4 = _mm_add_pi16(mm4, PW_ONE);
- mm5 = _mm_add_pi16(mm5, PW_ONE);
- mm4 = _mm_srai_pi16(mm4, 1); /* mm4=(CbE * -FIX(0.22800)) */
- mm5 = _mm_srai_pi16(mm5, 1); /* mm5=(CbO * -FIX(0.22800)) */
- mm0 = _mm_add_pi16(mm0, PW_ONE);
- mm1 = _mm_add_pi16(mm1, PW_ONE);
- mm0 = _mm_srai_pi16(mm0, 1); /* mm0=(CrE * FIX(0.40200)) */
- mm1 = _mm_srai_pi16(mm1, 1); /* mm1=(CrO * FIX(0.40200)) */
-
- mm4 = _mm_add_pi16(mm4, mm2);
- mm5 = _mm_add_pi16(mm5, mm3);
- mm4 = _mm_add_pi16(mm4, mm2); /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
- mm5 = _mm_add_pi16(mm5, mm3); /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
- mm0 = _mm_add_pi16(mm0, mm6); /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
- mm1 = _mm_add_pi16(mm1, mm7); /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
-
- wk[0] = mm4; /* wk(0)=(B-Y)E */
- wk[1] = mm5; /* wk(1)=(B-Y)O */
-
- mm4 = mm2;
- mm5 = mm3;
- mm2 = _mm_unpacklo_pi16(mm2, mm6);
- mm4 = _mm_unpackhi_pi16(mm4, mm6);
- mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
- mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
- mm3 = _mm_unpacklo_pi16(mm3, mm7);
- mm5 = _mm_unpackhi_pi16(mm5, mm7);
- mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
- mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
-
- mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
- mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
- mm2 = _mm_srai_pi32(mm2, SCALEBITS);
- mm4 = _mm_srai_pi32(mm4, SCALEBITS);
- mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
- mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
- mm3 = _mm_srai_pi32(mm3, SCALEBITS);
- mm5 = _mm_srai_pi32(mm5, SCALEBITS);
-
- mm2 = _mm_packs_pi32(mm2, mm4); /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
- mm3 = _mm_packs_pi32(mm3, mm5); /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
- mm2 = _mm_sub_pi16(mm2, mm6); /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
- mm3 = _mm_sub_pi16(mm3, mm7); /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
-
- mm5 = mm8; /* mm5=Y(01234567) */
-
- mm4 = _mm_cmpeq_pi16(mm4, mm4);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT); /* mm4={0xFF 0x00 0xFF 0x00 ..} */
- mm4 = _mm_and_si64(mm4, mm5); /* mm4=Y(0246)=YE */
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Y(1357)=YO */
-
- mm0 = _mm_add_pi16(mm0, mm4); /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
- mm1 = _mm_add_pi16(mm1, mm5); /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
- mm0 = _mm_packs_pu16(mm0, mm0); /* mm0=(R0 R2 R4 R6 ** ** ** **) */
- mm1 = _mm_packs_pu16(mm1, mm1); /* mm1=(R1 R3 R5 R7 ** ** ** **) */
-
- mm2 = _mm_add_pi16(mm2, mm4); /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
- mm3 = _mm_add_pi16(mm3, mm5); /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
- mm2 = _mm_packs_pu16(mm2, mm2); /* mm2=(G0 G2 G4 G6 ** ** ** **) */
- mm3 = _mm_packs_pu16(mm3, mm3); /* mm3=(G1 G3 G5 G7 ** ** ** **) */
-
- mm4 = _mm_add_pi16(mm4, wk[0]); /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
- mm5 = _mm_add_pi16(mm5, wk[1]); /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
- mm4 = _mm_packs_pu16(mm4, mm4); /* mm4=(B0 B2 B4 B6 ** ** ** **) */
- mm5 = _mm_packs_pu16(mm5, mm5); /* mm5=(B1 B3 B5 B7 ** ** ** **) */
-
-#if RGB_PIXELSIZE == 3
-
- /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
- /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
- mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */
- mmE = _mm_unpacklo_pi8(mmE, mmB); /* mmE=(20 01 22 03 24 05 26 07) */
- mmD = _mm_unpacklo_pi8(mmD, mmF); /* mmD=(11 21 13 23 15 25 17 27) */
-
- mmG = mmA;
- mmH = mmA;
- mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 01 02 12 22 03) */
- mmG = _mm_unpackhi_pi16(mmG, mmE); /* mmG=(04 14 24 05 06 16 26 07) */
-
- mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
- mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
-
- mmC = mmD;
- mmB = mmD;
- mmD = _mm_unpacklo_pi16(mmD, mmH); /* mmD=(11 21 02 12 13 23 04 14) */
- mmC = _mm_unpackhi_pi16(mmC, mmH); /* mmC=(15 25 06 16 17 27 -- --) */
-
- mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
-
- mmF = mmE;
- mmE = _mm_unpacklo_pi16(mmE, mmB); /* mmE=(22 03 13 23 24 05 15 25) */
- mmF = _mm_unpackhi_pi16(mmF, mmB); /* mmF=(26 07 17 27 -- -- -- --) */
-
- mmA = _mm_unpacklo_pi32(mmA, mmD); /* mmA=(00 10 20 01 11 21 02 12) */
- mmE = _mm_unpacklo_pi32(mmE, mmG); /* mmE=(22 03 13 23 04 14 24 05) */
- mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */
-
- if (num_cols >= 8) {
- _mm_store_si64((__m64 *)outptr, mmA);
- _mm_store_si64((__m64 *)(outptr + 8), mmE);
- _mm_store_si64((__m64 *)(outptr + 16), mmC);
- outptr += RGB_PIXELSIZE * 8;
- } else {
- col = num_cols * 3;
- asm(".set noreorder\r\n"
-
- "li $8, 16\r\n"
- "move $9, %4\r\n"
- "mov.s $f4, %1\r\n"
- "mov.s $f6, %3\r\n"
- "move $10, %5\r\n"
- "bltu $9, $8, 1f\r\n"
- "nop \r\n"
- "gssdlc1 $f4, 7($10)\r\n"
- "gssdrc1 $f4, 0($10)\r\n"
- "gssdlc1 $f6, 7+8($10)\r\n"
- "gssdrc1 $f6, 8($10)\r\n"
- "mov.s $f4, %2\r\n"
- "subu $9, $9, 16\r\n"
- "daddu $10, $10, 16\r\n"
- "b 2f\r\n"
- "nop \r\n"
-
- "1: \r\n"
- "li $8, 8\r\n" /* st8 */
- "bltu $9, $8, 2f\r\n"
- "nop \r\n"
- "gssdlc1 $f4, 7($10)\r\n"
- "gssdrc1 $f4, ($10)\r\n"
- "mov.s $f4, %3\r\n"
- "subu $9, $9, 8\r\n"
- "daddu $10, $10, 8\r\n"
-
- "2: \r\n"
- "li $8, 4\r\n" /* st4 */
- "mfc1 $11, $f4\r\n"
- "bltu $9, $8, 3f\r\n"
- "nop \r\n"
- "swl $11, 3($10)\r\n"
- "swr $11, 0($10)\r\n"
- "li $8, 32\r\n"
- "mtc1 $8, $f6\r\n"
- "dsrl $f4, $f4, $f6\r\n"
- "mfc1 $11, $f4\r\n"
- "subu $9, $9, 4\r\n"
- "daddu $10, $10, 4\r\n"
-
- "3: \r\n"
- "li $8, 2\r\n" /* st2 */
- "bltu $9, $8, 4f\r\n"
- "nop \r\n"
- "ush $11, 0($10)\r\n"
- "srl $11, 16\r\n"
- "subu $9, $9, 2\r\n"
- "daddu $10, $10, 2\r\n"
-
- "4: \r\n"
- "li $8, 1\r\n" /* st1 */
- "bltu $9, $8, 5f\r\n"
- "nop \r\n"
- "sb $11, 0($10)\r\n"
-
- "5: \r\n"
- "nop \r\n" /* end */
- : "=m" (*outptr)
- : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
- : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
- );
- }
-
-#else /* RGB_PIXELSIZE == 4 */
-
-#ifdef RGBX_FILLER_0XFF
- mm6 = _mm_cmpeq_pi8(mm6, mm6);
- mm7 = _mm_cmpeq_pi8(mm7, mm7);
-#else
- mm6 = _mm_xor_si64(mm6, mm6);
- mm7 = _mm_xor_si64(mm7, mm7);
-#endif
- /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
- /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
- /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
- /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
-
- mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */
- mmE = _mm_unpacklo_pi8(mmE, mmG); /* mmE=(20 30 22 32 24 34 26 36) */
- mmB = _mm_unpacklo_pi8(mmB, mmD); /* mmB=(01 11 03 13 05 15 07 17) */
- mmF = _mm_unpacklo_pi8(mmF, mmH); /* mmF=(21 31 23 33 25 35 27 37) */
-
- mmC = mmA;
- mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 30 02 12 22 32) */
- mmC = _mm_unpackhi_pi16(mmC, mmE); /* mmC=(04 14 24 34 06 16 26 36) */
- mmG = mmB;
- mmB = _mm_unpacklo_pi16(mmB, mmF); /* mmB=(01 11 21 31 03 13 23 33) */
- mmG = _mm_unpackhi_pi16(mmG, mmF); /* mmG=(05 15 25 35 07 17 27 37) */
-
- mmD = mmA;
- mmA = _mm_unpacklo_pi32(mmA, mmB); /* mmA=(00 10 20 30 01 11 21 31) */
- mmD = _mm_unpackhi_pi32(mmD, mmB); /* mmD=(02 12 22 32 03 13 23 33) */
- mmH = mmC;
- mmC = _mm_unpacklo_pi32(mmC, mmG); /* mmC=(04 14 24 34 05 15 25 35) */
- mmH = _mm_unpackhi_pi32(mmH, mmG); /* mmH=(06 16 26 36 07 17 27 37) */
-
- if (num_cols >= 8) {
- _mm_store_si64((__m64 *)outptr, mmA);
- _mm_store_si64((__m64 *)(outptr + 8), mmD);
- _mm_store_si64((__m64 *)(outptr + 16), mmC);
- _mm_store_si64((__m64 *)(outptr + 24), mmH);
- outptr += RGB_PIXELSIZE * 8;
- } else {
- col = num_cols;
- asm(".set noreorder\r\n" /* st16 */
-
- "li $8, 4\r\n"
- "move $9, %6\r\n"
- "move $10, %7\r\n"
- "mov.s $f4, %2\r\n"
- "mov.s $f6, %4\r\n"
- "bltu $9, $8, 1f\r\n"
- "nop \r\n"
- "gssdlc1 $f4, 7($10)\r\n"
- "gssdrc1 $f4, ($10)\r\n"
- "gssdlc1 $f6, 7+8($10)\r\n"
- "gssdrc1 $f6, 8($10)\r\n"
- "mov.s $f4, %3\r\n"
- "mov.s $f6, %5\r\n"
- "subu $9, $9, 4\r\n"
- "daddu $10, $10, 16\r\n"
-
- "1: \r\n"
- "li $8, 2\r\n" /* st8 */
- "bltu $9, $8, 2f\r\n"
- "nop \r\n"
- "gssdlc1 $f4, 7($10)\r\n"
- "gssdrc1 $f4, 0($10)\r\n"
- "mov.s $f4, $f6\r\n"
- "subu $9, $9, 2\r\n"
- "daddu $10, $10, 8\r\n"
-
- "2: \r\n"
- "li $8, 1\r\n" /* st4 */
- "bltu $9, $8, 3f\r\n"
- "nop \r\n"
- "gsswlc1 $f4, 3($10)\r\n"
- "gsswrc1 $f4, 0($10)\r\n"
-
- "3: \r\n"
- "li %1, 0\r\n" /* end */
- : "=m" (*outptr), "=r" (col)
- : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
- "r" (outptr)
- : "$f4", "$f6", "$8", "$9", "$10", "memory"
- );
- }
-
-#endif
-
- }
- }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
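
The decompression loop above factors the YCC->RGB multipliers so that every constant fits a signed 16-bit _mm_mulhi_pi16 or _mm_madd_pi16 operand, exactly as its "(This implementation)" comment describes. A scalar sketch of the same fixed-point arithmetic, using the constants that jdcolor-mmi.c defines below; mulhi16 and ycc_to_rgb are illustrative helpers, and a real caller would still clamp to 0..255 as the pack-with-saturation intrinsics do:

#include <stdint.h>
#include <stdio.h>

#define SCALEBITS 16

/* mulhi16() mimics _mm_mulhi_pi16 on one lane: signed 16x16 multiply,
 * keep bits 31:16. */
static int16_t mulhi16(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b) >> 16);
}

/* Scalar model of the factored arithmetic: 26345/14942 are F_0_402/F_0_228,
 * and -22554/18734 is the PW_MF0344_F0285 madd pair; "+1 then >>1" models
 * the PW_ONE add plus arithmetic shift in the deleted loop. */
static void ycc_to_rgb(int y, int cb, int cr, int *r, int *g, int *b)
{
  cb -= 128; cr -= 128;                             /* the 0xFF80 recenter */
  int r_y = ((mulhi16(2 * cr, 26345) + 1) >> 1) + cr;       /* 1.40200*Cr */
  int b_y = ((mulhi16(2 * cb, -14942) + 1) >> 1) + 2 * cb;  /* 1.77200*Cb */
  int g_y = (int)(((int32_t)cb * -22554 + (int32_t)cr * 18734 +
                   (1 << (SCALEBITS - 1))) >> SCALEBITS) - cr;
  *r = y + r_y; *g = y + g_y; *b = y + b_y;
}

int main(void)
{
  int r, g, b;
  ycc_to_rgb(128, 128, 255, &r, &g, &b);
  printf("%d %d %d\n", r, g, b);   /* 306 37 128 before clamping */
  return 0;
}
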
diff --git a/simd/loongson/jdcolor-mmi.c b/simd/loongson/jdcolor-mmi.c
deleted file mode 100644
index 2c58263..0000000
--- a/simd/loongson/jdcolor-mmi.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* YCC --> RGB CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define F_0_344 ((short)22554) /* FIX(0.34414) */
-#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */
-
-enum const_index {
- index_PW_ONE,
- index_PW_F0402,
- index_PW_MF0228,
- index_PW_MF0344_F0285,
- index_PD_ONEHALF
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi16(1, 1, 1, 1),
- _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
- _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
- _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
- _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
-};
-
-#define PW_ONE get_const_value(index_PW_ONE)
-#define PW_F0402 get_const_value(index_PW_F0402)
-#define PW_MF0228 get_const_value(index_PW_MF0228)
-#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
-#define PD_ONEHALF get_const_value(index_PD_ONEHALF)
-
-#define RGBX_FILLER_0XFF 1
-
-
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
-
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
-
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
-
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
-
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
-
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
-#include "jdcolext-mmi.c"
-#undef RGB_RED
-#undef RGB_GREEN
-#undef RGB_BLUE
-#undef RGB_PIXELSIZE
-#undef jsimd_ycc_rgb_convert_mmi
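
The four constants above are FIX()-scaled values or differences deliberately kept below 1.0 at scale 2^16 so their _mm_mulhi_pi16 products stay in range; the missing integer part (the "FIX(1)" or "FIX(2)") is restored by plain additions of Cb/Cr in jdcolext-mmi.c. This small illustrative check re-derives all four (it prints 22554 26345 18734 14942):

#include <stdio.h>

#define FIX(x) ((int)((x) * 65536.0 + 0.5))

int main(void)
{
  printf("%d %d %d %d\n",
         FIX(0.34414),              /* 22554 = F_0_344 */
         FIX(1.40200) - FIX(1.0),   /* 26345 = F_0_402 */
         FIX(1.0) - FIX(0.71414),   /* 18734 = F_0_285 */
         FIX(2.0) - FIX(1.77200));  /* 14942 = F_0_228 */
  return 0;
}
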
diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c
deleted file mode 100644
index 00a6265..0000000
--- a/simd/loongson/jdsample-mmi.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_mmi.h"
-
-
-enum const_index {
- index_PW_THREE,
- index_PW_SEVEN,
- index_PW_EIGHT,
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi16(3, 3, 3, 3),
- _uint64_set_pi16(7, 7, 7, 7),
- _uint64_set_pi16(8, 8, 8, 8),
-};
-
-#define PW_THREE get_const_value(index_PW_THREE)
-#define PW_SEVEN get_const_value(index_PW_SEVEN)
-#define PW_EIGHT get_const_value(index_PW_EIGHT)
-
-
-#define PROCESS_ROW(r) { \
- mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \
- mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \
- \
- mm0 = mm7; \
- mm4 = mm3; \
- mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \
- mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
- mm5 = mm7; \
- mm6 = mm3; \
- mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
- mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \
- \
- mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \
- mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \
- \
- mm1 = mm7; \
- mm2 = mm3; \
- mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \
- mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \
- mm4 = mm3; \
- mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
- \
- mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \
-  mm2 = _mm_or_si64(mm2, wk[r + 2]);  /* mm2=( 5 6 7 8) */ \
- \
- wk[r] = mm4; \
- \
- mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
- mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
- mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
- mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
- mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
- mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
- \
- mm1 = _mm_add_pi16(mm1, mm7); \
- mm5 = _mm_add_pi16(mm5, mm3); \
- mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \
- mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \
- mm0 = _mm_add_pi16(mm0, mm7); \
- mm2 = _mm_add_pi16(mm2, mm3); \
- mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \
- mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \
- \
- mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
- mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
- mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \
- mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \
- \
- _mm_store_si64((__m64 *)outptr##r, mm1); \
- _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
-}
-
-void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
- int inrow, outrow, incol, tmp, tmp1;
- __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
- __m64 wk[4], mm_tmp;
-
- for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
- inptr_1 = input_data[inrow - 1];
- inptr0 = input_data[inrow];
- inptr1 = input_data[inrow + 1];
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
-
- if (downsampled_width & 7) {
- tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
- tmp1 = downsampled_width * sizeof(JSAMPLE);
- asm("daddu $8, %3, %6\r\n"
- "lb $9, ($8)\r\n"
- "daddu $8, %3, %7\r\n"
- "sb $9, ($8)\r\n"
- "daddu $8, %4, %6\r\n"
- "lb $9, ($8)\r\n"
- "daddu $8, %4, %7\r\n"
- "sb $9, ($8)\r\n"
- "daddu $8, %5, %6\r\n"
- "lb $9, ($8)\r\n"
- "daddu $8, %5, %7\r\n"
- "sb $9, ($8)\r\n"
- : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
- : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
- : "$8", "$9"
- );
- }
-
- /* process the first column block */
- mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */
- mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */
- mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */
-
- mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */
- mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */
- mm5 = mm1;
- mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */
- mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */
- mm6 = mm2;
- mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */
- mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */
-
- mm0 = _mm_mullo_pi16(mm0, PW_THREE);
- mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
- mm7 = _mm_cmpeq_pi8(mm7, mm7);
- mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-
- mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */
- mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */
- mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */
- mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */
-
- _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */
- _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */
- _mm_store_si64((__m64 *)outptr1, mm2);
- _mm_store_si64((__m64 *)outptr1 + 1, mm6);
-
- mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */
- mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */
-
- wk[0] = mm1;
- wk[1] = mm2;
-
- for (incol = downsampled_width; incol > 0;
- incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
- outptr0 += 16, outptr1 += 16) {
-
- if (incol > 8) {
- /* process the next column block */
- mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */
- mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */
- mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */
-
- mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */
- mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */
- mm5 = mm1;
- mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */
- mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */
- mm6 = mm2;
- mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */
- mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */
-
- mm0 = _mm_mullo_pi16(mm0, PW_THREE);
- mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
- mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */
- mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */
- mm2 = _mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */
- mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */
-
- _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */
- _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */
- _mm_store_si64((__m64 *)outptr1 + 2, mm2);
- _mm_store_si64((__m64 *)outptr1 + 3, mm6);
-
- mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
- mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
-
- wk[2] = mm1;
- wk[3] = mm2;
- } else {
- /* process the last column block */
- mm1 = _mm_cmpeq_pi8(mm1, mm1);
- mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
- mm2 = mm1;
-
- mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
- mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */
- mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
- mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */
-
- wk[2] = mm1;
- wk[3] = mm2;
- }
-
- /* process the upper row */
- PROCESS_ROW(0)
-
- /* process the lower row */
- PROCESS_ROW(1)
- }
- }
-}
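
PROCESS_ROW is the horizontal stage of the triangular filter: even outputs mix the stored intermediate 3:1 with its left neighbor plus bias 8, odd outputs 3:1 with the right neighbor plus bias 7, and the truncating >>4 then rounds with no directional drift. A scalar model of that stage; fancy_upsample_row is an illustrative name, and intr[] stands for the saved "3*current row + neighbor row" intermediates:

#include <stdio.h>

/* Horizontal stage of the triangular filter: intr[] holds the vertical
 * 3:1 sums saved by the loop above (at 4x sample scale); edges replicate,
 * matching the wk[] masking in the deleted code. */
static void fancy_upsample_row(const int *intr, int n, unsigned char *out)
{
  for (int i = 0; i < n; i++) {
    int left  = intr[i > 0 ? i - 1 : 0];
    int right = intr[i < n - 1 ? i + 1 : n - 1];
    out[2 * i]     = (unsigned char)((3 * intr[i] + left  + 8) >> 4);
    out[2 * i + 1] = (unsigned char)((3 * intr[i] + right + 7) >> 4);
  }
}

int main(void)
{
  int intr[4] = { 40, 80, 120, 160 };   /* pretend vertical-pass output */
  unsigned char out[8];
  fancy_upsample_row(intr, 4, out);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]);              /* 10 12 18 22 28 32 38 40 */
  printf("\n");
  return 0;
}
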
diff --git a/simd/loongson/jfdctint-mmi.c b/simd/loongson/jfdctint-mmi.c
deleted file mode 100644
index a0ea692..0000000
--- a/simd/loongson/jfdctint-mmi.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, 2018, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_mmi.h"
-
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
-#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
-#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
-#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
-#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
-#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
-#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
-#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
-#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
-#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
-#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
-#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
-
-enum const_index {
- index_PW_F130_F054,
- index_PW_F054_MF130,
- index_PW_MF078_F117,
- index_PW_F117_F078,
- index_PW_MF060_MF089,
- index_PW_MF089_F060,
- index_PW_MF050_MF256,
- index_PW_MF256_F050,
- index_PD_DESCALE_P1,
- index_PD_DESCALE_P2,
- index_PW_DESCALE_P2X
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
- FIX_0_541, (FIX_0_541 + FIX_0_765)),
- _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
- (FIX_0_541 - FIX_1_847), FIX_0_541),
- _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
- FIX_1_175, (FIX_1_175 - FIX_1_961)),
- _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
- (FIX_1_175 - FIX_0_390), FIX_1_175),
- _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
- -FIX_0_899, (FIX_0_298 - FIX_0_899)),
- _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
- (FIX_1_501 - FIX_0_899), -FIX_0_899),
- _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
- -FIX_2_562, (FIX_2_053 - FIX_2_562)),
- _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
- (FIX_3_072 - FIX_2_562), -FIX_2_562),
- _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
- _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
- _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
- (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
-};
-
-#define PW_F130_F054 get_const_value(index_PW_F130_F054)
-#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
-#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
-#define PW_F117_F078 get_const_value(index_PW_F117_F078)
-#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
-#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
-#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
-#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
-#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
-#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
-#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
-
-
-#define DO_FDCT_COMMON(PASS) { \
- __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
- __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
- __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
- __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
- __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
- \
- /* (Original) \
- * z1 = (tmp12 + tmp13) * 0.541196100; \
- * out2 = z1 + tmp13 * 0.765366865; \
- * out6 = z1 + tmp12 * -1.847759065; \
- * \
- * (This implementation) \
- * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
- * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
- */ \
- \
- tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
- tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
- \
- out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
- out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
- out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
- out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
- \
- out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
- out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
- out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
- out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
- \
- out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
- out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
- out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
- out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
- \
- out2 = _mm_packs_pi32(out2l, out2h); \
- out6 = _mm_packs_pi32(out6l, out6h); \
- \
- /* Odd part */ \
- \
- z3 = _mm_add_pi16(tmp4, tmp6); \
- z4 = _mm_add_pi16(tmp5, tmp7); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = _mm_unpacklo_pi16(z3, z4); \
- z34h = _mm_unpackhi_pi16(z3, z4); \
- z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
- z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
- z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
- z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
- \
- /* (Original) \
- * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
- * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
- * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \
- * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \
- * \
- * (This implementation) \
- * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
- * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
- * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
- * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
- * out7 = tmp4 + z3; out5 = tmp5 + z4; \
- * out3 = tmp6 + z3; out1 = tmp7 + z4; \
- */ \
- \
- tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
- tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
- \
- tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
- tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
- tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
- tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
- \
- out7l = _mm_add_pi32(tmp4l, z3l); \
- out7h = _mm_add_pi32(tmp4h, z3h); \
- out1l = _mm_add_pi32(tmp7l, z4l); \
- out1h = _mm_add_pi32(tmp7h, z4h); \
- \
- out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
- out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
- out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
- out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
- \
- out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
- out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
- out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
- out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
- \
- out7 = _mm_packs_pi32(out7l, out7h); \
- out1 = _mm_packs_pi32(out1l, out1h); \
- \
- tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
- tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
- \
- tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
- tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
- tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
- tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
- \
- out5l = _mm_add_pi32(tmp5l, z4l); \
- out5h = _mm_add_pi32(tmp5h, z4h); \
- out3l = _mm_add_pi32(tmp6l, z3l); \
- out3h = _mm_add_pi32(tmp6h, z3h); \
- \
- out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
- out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
- out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
- out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
- \
- out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
- out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
- out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
- out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
- \
- out5 = _mm_packs_pi32(out5l, out5h); \
- out3 = _mm_packs_pi32(out3l, out3h); \
-}
-
-#define DO_FDCT_PASS1() { \
- __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
- __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
- __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
- __m64 tmp10, tmp11; \
- \
- row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
- row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
- row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
- row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
- row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
- row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
- row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
- row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
- \
- /* Transpose coefficients */ \
- \
- row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \
- row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \
- row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \
- row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \
- \
- row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \
- row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \
- row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \
- row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \
- \
- col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \
- col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \
- col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \
- col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \
- \
- tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \
- tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \
- tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \
- tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \
- \
- col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \
- col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \
- col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \
- col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \
- \
- tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \
- tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \
- tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \
- tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \
- \
- /* Even part */ \
- \
- tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
- tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
- tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
- tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
- \
- out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
- out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
- out0 = _mm_slli_pi16(out0, PASS1_BITS); \
- out4 = _mm_slli_pi16(out4, PASS1_BITS); \
- \
- DO_FDCT_COMMON(1) \
- \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
-}
-
-#define DO_FDCT_PASS2() { \
- __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
- __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
- __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
- __m64 tmp10, tmp11; \
- \
- col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
- col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
- col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
- col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
- col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \
- col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \
- col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \
- col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \
- \
- /* Transpose coefficients */ \
- \
- col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \
- col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \
- col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \
- col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \
- \
- col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \
- col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \
- col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \
- col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \
- \
- row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \
- row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \
- row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \
- row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \
- \
- tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \
- tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \
- tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \
- tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \
- \
- row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \
- row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \
- row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \
- row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \
- \
- tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \
- tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \
- tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \
- tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \
- \
- /* Even part */ \
- \
- tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \
- tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \
- tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \
- tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \
- \
- out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \
- out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \
- \
- out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
- out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
- out0 = _mm_srai_pi16(out0, PASS1_BITS); \
- out4 = _mm_srai_pi16(out4, PASS1_BITS); \
- \
- DO_FDCT_COMMON(2) \
- \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
- _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
-}
-
-void jsimd_fdct_islow_mmi(DCTELEM *data)
-{
- __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- __m64 out0, out1, out2, out3, out4, out5, out6, out7;
- __m64 tmp12, tmp13;
- DCTELEM *dataptr = data;
-
- /* Pass 1: process rows. */
-
- DO_FDCT_PASS1()
- dataptr += DCTSIZE * 4;
- DO_FDCT_PASS1()
-
- /* Pass 2: process columns. */
-
- dataptr = data;
- DO_FDCT_PASS2()
- dataptr += 4;
- DO_FDCT_PASS2()
-}
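
The rotation rewrites inside DO_FDCT_COMMON are the standard pmaddwd factoring: distributing the shared z1 term lets one _mm_madd_pi16 of a (tmp13, tmp12) word pair against a packed constant pair produce a whole output term. The two formulations are bit-identical in integer arithmetic, as this sketch shows (madd models a single pmaddwd lane; names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 13
#define FIX(x) ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

/* One lane of pmaddwd / _mm_madd_pi16: two 16x16 products summed to 32
 * bits. */
static int32_t madd(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  int16_t tmp12 = 100, tmp13 = -60;

  /* Original formulation */
  int32_t z1 = (tmp12 + tmp13) * FIX(0.541196100);
  int32_t out2_orig = z1 + tmp13 * FIX(0.765366865);

  /* Factored formulation, i.e. the PW_F130_F054 constant pair */
  int32_t out2_madd = madd(tmp13, tmp12,
                           FIX(0.541196100) + FIX(0.765366865),
                           FIX(0.541196100));

  printf("%d %d\n", out2_orig, out2_madd);   /* identical: -198880 -198880 */
  return 0;
}
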
diff --git a/simd/loongson/jidctint-mmi.c b/simd/loongson/jidctint-mmi.c
deleted file mode 100644
index 419c638..0000000
--- a/simd/loongson/jidctint-mmi.c
+++ /dev/null
@@ -1,571 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, 2018, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER INVERSE DCT */
-
-#include "jsimd_mmi.h"
-
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
-#define CENTERJSAMPLE 128
-
-#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */
-#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */
-#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */
-#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */
-#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */
-#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */
-#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
-#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
-#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
-#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
-#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
-#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
-
-enum const_index {
- index_PW_F130_F054,
- index_PW_F054_MF130,
- index_PW_MF078_F117,
- index_PW_F117_F078,
- index_PW_MF060_MF089,
- index_PW_MF089_F060,
- index_PW_MF050_MF256,
- index_PW_MF256_F050,
- index_PD_DESCALE_P1,
- index_PD_DESCALE_P2,
- index_PB_CENTERJSAMP
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
- FIX_0_541, (FIX_0_541 + FIX_0_765)),
- _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
- (FIX_0_541 - FIX_1_847), FIX_0_541),
- _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
- FIX_1_175, (FIX_1_175 - FIX_1_961)),
- _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
- (FIX_1_175 - FIX_0_390), FIX_1_175),
- _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
- -FIX_0_899, (FIX_0_298 - FIX_0_899)),
- _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
- (FIX_1_501 - FIX_0_899), -FIX_0_899),
- _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
- -FIX_2_562, (FIX_2_053 - FIX_2_562)),
- _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
- (FIX_3_072 - FIX_2_562), -FIX_2_562),
- _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
- _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
- _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
- CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
-};
-
-#define PW_F130_F054 get_const_value(index_PW_F130_F054)
-#define PW_F054_MF130 get_const_value(index_PW_F054_MF130)
-#define PW_MF078_F117 get_const_value(index_PW_MF078_F117)
-#define PW_F117_F078 get_const_value(index_PW_F117_F078)
-#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
-#define PW_MF089_F060 get_const_value(index_PW_MF089_F060)
-#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
-#define PW_MF256_F050 get_const_value(index_PW_MF256_F050)
-#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1)
-#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2)
-#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP)
-
-
-#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32))
-#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64))
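The two zero tests above reinterpret a vector register's contents as a raw integer bit pattern; on this target vectors live in floating-point registers (see loongson-mmintrin.h), so comparing against 0.0 would misclassify patterns such as negative zero. A minimal stand-alone sketch of the same check, using memcpy rather than the pointer cast to stay clear of strict-aliasing trouble (names here are illustrative, not the library's):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* __m64 is a double on this target; "is it zero" means inspecting the
   bit pattern, not comparing against 0.0. */
static int is_zero_m64(double v)
{
  uint64_t bits;

  memcpy(&bits, &v, sizeof(bits));   /* well-defined type pun */
  return bits == 0;
}

int main(void)
{
  double zero = 0.0, nonzero;
  uint64_t one = 1;

  memcpy(&nonzero, &one, sizeof(nonzero));
  assert(is_zero_m64(zero));
  assert(!is_zero_m64(nonzero));
  return 0;
}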
-
-
-#define DO_IDCT_COMMON(PASS) { \
- __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
- __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
- __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
- __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
- __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
- \
- z3 = _mm_add_pi16(tmp0, tmp2); \
- z4 = _mm_add_pi16(tmp1, tmp3); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = _mm_unpacklo_pi16(z3, z4); \
- z34h = _mm_unpackhi_pi16(z3, z4); \
- z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
- z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
- z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
- z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
- \
- /* (Original) \
- * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
- * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
- * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * tmp0 += z1 + z3; tmp1 += z2 + z4; \
- * tmp2 += z2 + z3; tmp3 += z1 + z4; \
- * \
- * (This implementation) \
- * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
- * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
- * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
- * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
- * tmp0 += z3; tmp1 += z4; \
- * tmp2 += z3; tmp3 += z4; \
- */ \
- \
- tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
- tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
- \
- tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
- tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
- tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
- tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
- \
- tmp0l = _mm_add_pi32(tmp0l, z3l); \
- tmp0h = _mm_add_pi32(tmp0h, z3h); \
- tmp3l = _mm_add_pi32(tmp3l, z4l); \
- tmp3h = _mm_add_pi32(tmp3h, z4h); \
- \
- tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
- tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
- \
- tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
- tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
- tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
- tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
- \
- tmp1l = _mm_add_pi32(tmp1l, z4l); \
- tmp1h = _mm_add_pi32(tmp1h, z4h); \
- tmp2l = _mm_add_pi32(tmp2l, z3l); \
- tmp2h = _mm_add_pi32(tmp2h, z3h); \
- \
- /* Final output stage */ \
- \
- out0l = _mm_add_pi32(tmp10l, tmp3l); \
- out0h = _mm_add_pi32(tmp10h, tmp3h); \
- out7l = _mm_sub_pi32(tmp10l, tmp3l); \
- out7h = _mm_sub_pi32(tmp10h, tmp3h); \
- \
- out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
- out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
- out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
- out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
- \
- out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
- out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
- out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
- out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
- \
- out0 = _mm_packs_pi32(out0l, out0h); \
- out7 = _mm_packs_pi32(out7l, out7h); \
- \
- out1l = _mm_add_pi32(tmp11l, tmp2l); \
- out1h = _mm_add_pi32(tmp11h, tmp2h); \
- out6l = _mm_sub_pi32(tmp11l, tmp2l); \
- out6h = _mm_sub_pi32(tmp11h, tmp2h); \
- \
- out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
- out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
- out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
- out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
- \
- out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
- out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
- out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
- out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
- \
- out1 = _mm_packs_pi32(out1l, out1h); \
- out6 = _mm_packs_pi32(out6l, out6h); \
- \
- out2l = _mm_add_pi32(tmp12l, tmp1l); \
- out2h = _mm_add_pi32(tmp12h, tmp1h); \
- out5l = _mm_sub_pi32(tmp12l, tmp1l); \
- out5h = _mm_sub_pi32(tmp12h, tmp1h); \
- \
- out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
- out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
- out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
- out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
- \
- out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
- out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
- out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
- out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
- \
- out2 = _mm_packs_pi32(out2l, out2h); \
- out5 = _mm_packs_pi32(out5l, out5h); \
- \
- out3l = _mm_add_pi32(tmp13l, tmp0l); \
- out3h = _mm_add_pi32(tmp13h, tmp0h); \
- \
- out4l = _mm_sub_pi32(tmp13l, tmp0l); \
- out4h = _mm_sub_pi32(tmp13h, tmp0h); \
- \
- out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
- out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
- out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
- out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
- \
- out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
- out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
- out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
- out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
- \
- out3 = _mm_packs_pi32(out3l, out3h); \
- out4 = _mm_packs_pi32(out4l, out4h); \
-}
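The "(This implementation)" comments above describe the central trick of this file: the shared-term rotations are refactored so that each output is a plain dot product of two adjacent 16-bit lanes, which is exactly what one pmaddhw instruction computes per 32-bit lane, with the coefficient pairs baked into constants such as PW_MF078_F117 and PW_F117_F078. A stand-alone scalar check (illustrative only, not library code) that the factored form is an exact integer identity and that the constant pairing drives a single multiply-add lane:

#include <assert.h>
#include <stdint.h>

#define FIX_1_175  9633   /* FIX(1.175875602) */
#define FIX_1_961  16069  /* FIX(1.961570560) */
#define FIX_0_390  3196   /* FIX(0.390180644) */

/* One 32-bit lane of pmaddhw: two adjacent 16-bit products, summed. */
static int32_t madd_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1)
{
  return (int32_t)a0 * b0 + (int32_t)a1 * b1;
}

int main(void)
{
  for (int32_t z3 = -1024; z3 <= 1024; z3 += 97) {
    for (int32_t z4 = -1024; z4 <= 1024; z4 += 89) {
      /* Original form: one shared term, three multiplies */
      int32_t z5 = (z3 + z4) * FIX_1_175;
      int32_t ref3 = z3 * -FIX_1_961 + z5;
      int32_t ref4 = z4 * -FIX_0_390 + z5;

      /* Factored form: one madd over the interleaved (z3, z4) pair */
      int32_t new3 = madd_lane((int16_t)z3, (int16_t)z4,
                               FIX_1_175 - FIX_1_961, FIX_1_175);
      int32_t new4 = madd_lane((int16_t)z3, (int16_t)z4,
                               FIX_1_175, FIX_1_175 - FIX_0_390);

      assert(ref3 == new3 && ref4 == new4);  /* exact, by distributivity */
    }
  }
  return 0;
}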
-
-#define DO_IDCT_PASS1(iter) { \
- __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
- __m64 quant0l, quant1l, quant2l, quant3l; \
- __m64 quant4l, quant5l, quant6l, quant7l; \
- __m64 z23, z2, z3, z23l, z23h; \
- __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
- __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
- __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
- __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
- __m32 col0a, col1a, mm0; \
- \
- col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
- col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
- mm0 = _mm_or_si32(col0a, col1a); \
- \
- if (test_m32_zero(mm0)) { \
- __m64 mm1, mm2; \
- \
- col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
- col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
- col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
- col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
- col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
- col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
- col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
- col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
- \
- mm1 = _mm_or_si64(col1l, col3l); \
- mm2 = _mm_or_si64(col2l, col4l); \
- mm1 = _mm_or_si64(mm1, col5l); \
- mm2 = _mm_or_si64(mm2, col6l); \
- mm1 = _mm_or_si64(mm1, col7l); \
- mm1 = _mm_or_si64(mm1, mm2); \
- \
- if (test_m64_zero(mm1)) { \
- __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
- \
- /* AC terms all zero */ \
- \
- quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
- \
- dcval = _mm_mullo_pi16(col0l, quant0l); \
- dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \
- \
- dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \
- dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \
- \
- row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \
- row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \
- row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \
- row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \
- \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
- \
- goto nextcolumn##iter; \
- } \
- } \
- \
- /* Even part \
- * \
- * (Original) \
- * z1 = (z2 + z3) * 0.541196100; \
- * tmp2 = z1 + z3 * -1.847759065; \
- * tmp3 = z1 + z2 * 0.765366865; \
- * \
- * (This implementation) \
- * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
- * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
- */ \
- \
- col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \
- col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \
- col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \
- col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \
- \
- quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
- quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
- quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
- quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
- \
- z2 = _mm_mullo_pi16(col2l, quant2l); \
- z3 = _mm_mullo_pi16(col6l, quant6l); \
- \
- z23l = _mm_unpacklo_pi16(z2, z3); \
- z23h = _mm_unpackhi_pi16(z2, z3); \
- tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
- tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
- tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
- tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
- \
- z2 = _mm_mullo_pi16(col0l, quant0l); \
- z3 = _mm_mullo_pi16(col4l, quant4l); \
- \
- z23 = _mm_add_pi16(z2, z3); \
- tmp0l = _mm_loadlo_pi16_f(z23); \
- tmp0h = _mm_loadhi_pi16_f(z23); \
- tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
- tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
- \
- tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
- tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
- tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
- tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
- \
- z23 = _mm_sub_pi16(z2, z3); \
- tmp1l = _mm_loadlo_pi16_f(z23); \
- tmp1h = _mm_loadhi_pi16_f(z23); \
- tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
- tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
- \
- tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
- tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
- tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
- tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
- \
- /* Odd part */ \
- \
- col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \
- col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \
- col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \
- col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \
- \
- quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
- quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
- quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
- quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
- \
- tmp0 = _mm_mullo_pi16(col7l, quant7l); \
- tmp1 = _mm_mullo_pi16(col5l, quant5l); \
- tmp2 = _mm_mullo_pi16(col3l, quant3l); \
- tmp3 = _mm_mullo_pi16(col1l, quant1l); \
- \
- DO_IDCT_COMMON(1) \
- \
- /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
- /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
- /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
- /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
- \
- /* Transpose coefficients */ \
- \
- row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \
- row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \
- row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \
- row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \
- \
- row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \
- row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \
- row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \
- row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \
- \
- row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \
- row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \
- row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \
- row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \
- \
- row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \
- row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \
- row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \
- row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \
- \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
- _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
-}
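The test_m32_zero / test_m64_zero ladder at the top of this macro is the classic sparse-block shortcut: when every AC coefficient in the column group is zero, the IDCT of each column is a constant, so pass 1 reduces to dequantizing and scaling the DC term and replicating it. The same shortcut for a single column in scalar form (a sketch with assumed names; the output is written contiguously here for simplicity, where the MMI code fills four workspace rows at once):

#include <stdint.h>

#define DCTSIZE    8
#define PASS1_BITS 2

/* Returns 1 and fills out[] when the column has no AC energy. */
int dc_only_column(const int16_t *col,    /* column data, stride DCTSIZE */
                   int16_t quant_dc, int16_t *out)
{
  for (int i = 1; i < DCTSIZE; i++)
    if (col[DCTSIZE * i])
      return 0;                 /* nonzero AC term: take the full IDCT path */

  /* Constant result: dequantized DC, pre-scaled for pass 2 */
  int16_t dcval = (int16_t)((col[0] * quant_dc) << PASS1_BITS);
  for (int i = 0; i < DCTSIZE; i++)
    out[i] = dcval;
  return 1;
}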
-
-#define DO_IDCT_PASS2(ctr) { \
- __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
- __m64 z23, z23l, z23h; \
- __m64 col0123a, col0123b, col0123c, col0123d; \
- __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
- __m64 col0, col1, col2, col3; \
- __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
- __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
- \
- row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \
- row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \
- row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \
- row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \
- row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \
- row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \
- row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \
- row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \
- \
- /* Even part \
- * \
- * (Original) \
- * z1 = (z2 + z3) * 0.541196100; \
- * tmp2 = z1 + z3 * -1.847759065; \
- * tmp3 = z1 + z2 * 0.765366865; \
- * \
- * (This implementation) \
- * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
- * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
- */ \
- \
- z23l = _mm_unpacklo_pi16(row2l, row6l); \
- z23h = _mm_unpackhi_pi16(row2l, row6l); \
- \
- tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
- tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
- tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
- tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
- \
- z23 = _mm_add_pi16(row0l, row4l); \
- tmp0l = _mm_loadlo_pi16_f(z23); \
- tmp0h = _mm_loadhi_pi16_f(z23); \
- tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
- tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
- \
- tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
- tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
- tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
- tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
- \
- z23 = _mm_sub_pi16(row0l, row4l); \
- tmp1l = _mm_loadlo_pi16_f(z23); \
- tmp1h = _mm_loadhi_pi16_f(z23); \
- tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
- tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
- \
- tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
- tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
- tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
- tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
- \
- /* Odd part */ \
- \
- tmp0 = row7l; \
- tmp1 = row5l; \
- tmp2 = row3l; \
- tmp3 = row1l; \
- \
- DO_IDCT_COMMON(2) \
- \
- /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
- /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
- /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
- /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
- \
- row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \
- row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \
- row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \
- row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \
- \
- row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
- row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
- row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
- row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
- \
- /* Transpose coefficients */ \
- \
- col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \
- col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \
- col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \
- col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \
- \
- col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \
- col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \
- col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \
- col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \
- \
- col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \
- col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \
- col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \
- col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \
- \
- _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
- _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
- _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
- _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
-}
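Both passes end with the interleave-based transpose traced in the comments above: one round of unpacklo/unpackhi on narrow lanes followed by a round on double-width lanes transposes a 4x4 tile. A plain-C model of the pattern (illustrative; the real code applies it to halfwords in pass 1 and to packed bytes in pass 2):

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t e[4]; } v4;

static v4 unpacklo(v4 a, v4 b) { return (v4){{ a.e[0], b.e[0], a.e[1], b.e[1] }}; }
static v4 unpackhi(v4 a, v4 b) { return (v4){{ a.e[2], b.e[2], a.e[3], b.e[3] }}; }

int main(void)
{
  v4 row0 = {{  0,  1,  2,  3 }}, row1 = {{ 10, 11, 12, 13 }};
  v4 row2 = {{ 20, 21, 22, 23 }}, row3 = {{ 30, 31, 32, 33 }};

  /* Round 1: interleave 16-bit lanes of neighboring rows */
  v4 a = unpacklo(row0, row1);   /* ( 0 10  1 11) */
  v4 b = unpackhi(row0, row1);   /* ( 2 12  3 13) */
  v4 c = unpacklo(row2, row3);   /* (20 30 21 31) */
  v4 d = unpackhi(row2, row3);   /* (22 32 23 33) */

  /* Round 2: interleave 32-bit pairs, completing the transpose */
  v4 cols[4] = {
    {{ a.e[0], a.e[1], c.e[0], c.e[1] }},   /* ( 0 10 20 30) */
    {{ a.e[2], a.e[3], c.e[2], c.e[3] }},   /* ( 1 11 21 31) */
    {{ b.e[0], b.e[1], d.e[0], d.e[1] }},   /* ( 2 12 22 32) */
    {{ b.e[2], b.e[3], d.e[2], d.e[3] }},   /* ( 3 13 23 33) */
  };

  for (int i = 0; i < 4; i++)
    printf("%3d %3d %3d %3d\n", cols[i].e[0], cols[i].e[1],
           cols[i].e[2], cols[i].e[3]);
  return 0;
}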
-
-void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- __m64 tmp0, tmp1, tmp2, tmp3;
- __m64 out0, out1, out2, out3, out4, out5, out6, out7;
- JCOEFPTR inptr;
- ISLOW_MULT_TYPE *quantptr;
- JCOEF *wsptr;
- JCOEF workspace[DCTSIZE2]; /* buffers data between passes */
-
- /* Pass 1: process columns. */
-
- inptr = coef_block;
- quantptr = (ISLOW_MULT_TYPE *)dct_table;
- wsptr = workspace;
-
- DO_IDCT_PASS1(1)
-nextcolumn1:
- inptr += 4;
- quantptr += 4;
- wsptr += DCTSIZE * 4;
- DO_IDCT_PASS1(2)
-nextcolumn2:
-
- /* Pass 2: process rows. */
-
- wsptr = workspace;
-
- DO_IDCT_PASS2(0)
- wsptr += 4;
- DO_IDCT_PASS2(4)
-}
diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c
deleted file mode 100644
index f9a3f81..0000000
--- a/simd/loongson/jquanti-mmi.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * Copyright (C) 2018, D. R. Commander. All Rights Reserved.
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define DO_QUANT() { \
- mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
- mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
- \
- mm0 = mm2; \
- mm1 = mm3; \
- \
- mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1)); /* -1 if value < 0, */ \
- /* 0 otherwise */ \
- mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
- \
- mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
- mm1 = _mm_xor_si64(mm1, mm3); \
- mm0 = _mm_sub_pi16(mm0, mm2); \
- mm1 = _mm_sub_pi16(mm1, mm3); \
- \
- corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
- corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
- \
- mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \
- mm1 = _mm_add_pi16(mm1, corr1); \
- \
- mm4 = mm0; \
- mm5 = mm1; \
- \
- recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
- recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
- \
- mm0 = _mm_mulhi_pi16(mm0, recip0); \
- mm1 = _mm_mulhi_pi16(mm1, recip1); \
- \
- mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \
- mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \
- /* initial value (input value is never */ \
- /* negative as we inverted it at the */ \
- /* start of this routine) */ \
- \
- scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
- scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
- \
- mm6 = scale0; \
- mm7 = scale1; \
- mm4 = mm0; \
- mm5 = mm1; \
- \
- mm0 = _mm_mulhi_pi16(mm0, mm6); \
- mm1 = _mm_mulhi_pi16(mm1, mm7); \
- \
- mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1)); /* determine if scale... */ \
- /* is negative */ \
- mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
- \
- mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \
- mm7 = _mm_and_si64(mm7, mm5); \
- mm0 = _mm_add_pi16(mm0, mm6); \
- mm1 = _mm_add_pi16(mm1, mm7); \
- \
- mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1)); /* then check if... */ \
- mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1)); /* negative input */ \
- \
- mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \
- mm5 = _mm_and_si64(mm5, scale1); \
- mm0 = _mm_add_pi16(mm0, mm4); \
- mm1 = _mm_add_pi16(mm1, mm5); \
- \
- mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
- mm1 = _mm_xor_si64(mm1, mm3); \
- mm0 = _mm_sub_pi16(mm0, mm2); \
- mm1 = _mm_sub_pi16(mm1, mm3); \
- \
- _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
- _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
- \
- workspace += DCTSIZE; \
- divisors += DCTSIZE; \
- output_ptr += DCTSIZE; \
-}
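The xor/sub pairs above are the branch-free sign-magnitude trick: an arithmetic right shift by WORD_BIT - 1 yields an all-ones mask for negative lanes, (x ^ mask) - mask then computes |x|, and reapplying the identical transform after the reciprocal multiply restores the sign. Scalar equivalent, exercised over the full int16 range except -32768, whose negation does not fit (a sketch, not library code):

#include <assert.h>
#include <stdint.h>

static int16_t abs_branchless(int16_t x, int16_t *mask_out)
{
  int16_t mask = x >> 15;          /* -1 if x < 0, 0 otherwise */

  *mask_out = mask;
  return (int16_t)((x ^ mask) - mask);
}

int main(void)
{
  for (int v = -32767; v <= 32767; v++) {
    int16_t mask;
    int16_t a = abs_branchless((int16_t)v, &mask);
    int16_t back = (int16_t)((a ^ mask) - mask);  /* same op restores sign */

    assert(a == (v < 0 ? -v : v));
    assert(back == (int16_t)v);
  }
  return 0;
}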
-
-
-void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- JCOEFPTR output_ptr = coef_block;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 corr0, corr1, recip0, recip1, scale0, scale1;
-
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
- DO_QUANT()
-}
diff --git a/simd/loongson/jsimd.c b/simd/loongson/jsimd.c
deleted file mode 100644
index e8b1832..0000000
--- a/simd/loongson/jsimd.c
+++ /dev/null
@@ -1,610 +0,0 @@
-/*
- * jsimd_loongson.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * Loongson architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
-#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
-
-static unsigned int simd_support = ~0;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd(void)
-{
-#ifndef NO_GETENV
- char *env = NULL;
-#endif
-
- if (simd_support != ~0U)
- return;
-
-  simd_support = JSIMD_MMI;  /* plain assignment, so the ~0U sentinel is cleared */
-
-#ifndef NO_GETENV
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-#endif
-}
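Regarding the FIXME above: since Loongson MMI needs no runtime probing, the usual cure for the race is to latch the result through a one-time initializer. A hedged sketch using C11's call_once (assumes a C11 toolchain; this is not what the file ships):

#include <stdlib.h>
#include <string.h>
#include <threads.h>

#define JSIMD_MMI 0x01              /* stand-in for the library's flag */

static unsigned int simd_support;
static once_flag simd_once = ONCE_FLAG_INIT;

static void probe_simd(void)
{
  unsigned int support = JSIMD_MMI; /* MMI is always present here */
  char *env = getenv("JSIMD_FORCENONE");

  if (env != NULL && strcmp(env, "1") == 0)
    support = 0;                    /* honor the forced-off override */
  simd_support = support;           /* written exactly once */
}

static unsigned int get_simd_support(void)
{
  call_once(&simd_once, probe_simd);
  return simd_support;
}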
-
-GLOBAL(int)
-jsimd_can_rgb_ycc(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- mmifct = jsimd_extrgb_ycc_convert_mmi;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mmifct = jsimd_extrgbx_ycc_convert_mmi;
- break;
- case JCS_EXT_BGR:
- mmifct = jsimd_extbgr_ycc_convert_mmi;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mmifct = jsimd_extbgrx_ycc_convert_mmi;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mmifct = jsimd_extxbgr_ycc_convert_mmi;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mmifct = jsimd_extxrgb_ycc_convert_mmi;
- break;
- default:
- mmifct = jsimd_rgb_ycc_convert_mmi;
- break;
- }
-
- mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
- void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- mmifct = jsimd_ycc_extrgb_convert_mmi;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- mmifct = jsimd_ycc_extrgbx_convert_mmi;
- break;
- case JCS_EXT_BGR:
- mmifct = jsimd_ycc_extbgr_convert_mmi;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- mmifct = jsimd_ycc_extbgrx_convert_mmi;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- mmifct = jsimd_ycc_extxbgr_convert_mmi;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- mmifct = jsimd_ycc_extxrgb_convert_mmi;
- break;
- default:
- mmifct = jsimd_ycc_rgb_convert_mmi;
- break;
- }
-
- mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
-}
-
-GLOBAL(void)
-jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow(DCTELEM *data)
-{
- jsimd_fdct_islow_mmi(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast(DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_float(FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
-{
- jsimd_quantize_mmi(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_MMI)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block(void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET *)
-jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_first_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
-{
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_refine_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
-{
- return 0;
-}
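Everything in this file follows one contract: a jsimd_can_xxx() predicate re-validates the compile-time assumptions the kernels were built with (BITS_IN_JSAMPLE, type sizes) plus the feature bit, and the matching jsimd_xxx() wrapper selects the MMI kernel for the active colorspace, as in the switch statements above. Reduced to a stand-alone model (all names here are illustrative):

#include <stdio.h>

typedef void (*convert_fn)(int width);

static void convert_c(int width)   { printf("C fallback, %d px\n", width); }
static void convert_mmi(int width) { printf("MMI kernel, %d px\n", width); }

/* Mirrors jsimd_can_*(): guard the dispatch behind every assumption
   the SIMD kernel bakes in. */
static int can_use_mmi(void)
{
  if (sizeof(short) != 2)   /* e.g. sizeof(JCOEF) != 2 in the real code */
    return 0;
  return 1;                 /* plus the JSIMD_MMI feature bit */
}

int main(void)
{
  convert_fn fn = can_use_mmi() ? convert_mmi : convert_c;

  fn(640);
  return 0;
}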
diff --git a/simd/loongson/jsimd_mmi.h b/simd/loongson/jsimd_mmi.h
deleted file mode 100644
index 2506aa8..0000000
--- a/simd/loongson/jsimd_mmi.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
-#include "../../jdct.h"
-#include "loongson-mmintrin.h"
-
-
-/* Common code */
-
-#define SIZEOF_MMWORD 8
-#define BYTE_BIT 8
-#define WORD_BIT 16
-#define SCALEBITS 16
-
-#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
-  (((uint64_t)(uint8_t)(a) << 56) | \
-   ((uint64_t)(uint8_t)(b) << 48) | \
-   ((uint64_t)(uint8_t)(c) << 40) | \
-   ((uint64_t)(uint8_t)(d) << 32) | \
-   ((uint64_t)(uint8_t)(e) << 24) | \
-   ((uint64_t)(uint8_t)(f) << 16) | \
-   ((uint64_t)(uint8_t)(g) << 8) | \
-   ((uint64_t)(uint8_t)(h)))
-#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)(a) << 48) | \
-                                      ((uint64_t)(uint16_t)(b) << 32) | \
-                                      ((uint64_t)(uint16_t)(c) << 16) | \
-                                      ((uint64_t)(uint16_t)(d)))
-#define _uint64_set_pi32(a, b) (((uint64_t)(uint32_t)(a) << 32) | \
-                                ((uint64_t)(uint32_t)(b)))
-
-#define get_const_value(index) (*(__m64 *)&const_value[index])
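The packing macros above build 64-bit vector constants lane by lane; each argument is truncated to its lane width before shifting, so the negative coefficients that fill the FIX tables occupy exactly one lane as two's complement. A quick stand-alone sanity check (the macro body is repeated so the snippet compiles on its own):

#include <assert.h>
#include <stdint.h>

#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)(a) << 48) | \
                                      ((uint64_t)(uint16_t)(b) << 32) | \
                                      ((uint64_t)(uint16_t)(c) << 16) | \
                                      ((uint64_t)(uint16_t)(d)))

int main(void)
{
  /* -1 and -3 truncate to 0xffff and 0xfffd; no sign bits leak into
     the neighboring lanes. */
  assert(_uint64_set_pi16(-1, 2, -3, 4) == 0xffff0002fffd0004ULL);
  return 0;
}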
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h
deleted file mode 100644
index 50d166b..0000000
--- a/simd/loongson/loongson-mmintrin.h
+++ /dev/null
@@ -1,1324 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Copyright (C) 2019, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#ifndef __LOONGSON_MMINTRIN_H__
-#define __LOONGSON_MMINTRIN_H__
-
-#include <stdint.h>
-
-
-#define FUNCTION_ATTRIBS \
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-
-
-/* Vectors are stored in 64-bit floating-point registers. */
-typedef double __m64;
-
-/* Having a 32-bit datatype allows us to use 32-bit loads where only the
-   low half of a vector is needed (the load8888 example is a carryover
-   from pixman, where this header originated). */
-typedef float __m32;
-
-
-/********** Set Operations **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_setzero_si64(void)
-{
- return 0.0;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
- uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
-{
- __m64 ret;
- uint32_t lo = ((uint32_t)__b6 << 24) |
- ((uint32_t)__b4 << 16) |
- ((uint32_t)__b2 << 8) |
- (uint32_t)__b0;
- uint32_t hi = ((uint32_t)__b7 << 24) |
- ((uint32_t)__b5 << 16) |
- ((uint32_t)__b3 << 8) |
- (uint32_t)__b1;
-
- asm("mtc1 %1, %0\n\t"
- "mtc1 %2, $f0\n\t"
- "punpcklbh %0, %0, $f0\n\t"
- : "=f" (ret)
- : "r" (lo), "r" (hi)
- : "$f0"
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
-{
- __m64 ret;
- uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
- uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
-
- asm("mtc1 %1, %0\n\t"
- "mtc1 %2, $f0\n\t"
- "punpcklhw %0, %0, $f0\n\t"
- : "=f" (ret)
- : "r" (lo), "r" (hi)
- : "$f0"
- );
-
- return ret;
-}
-
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
- (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set_pi32(uint32_t __i1, uint32_t __i0)
-{
- if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
- uint64_t val = ((uint64_t)__i1 << 32) |
- ((uint64_t)__i0 << 0);
-
- return *(__m64 *)&val;
- } else if (__i1 == __i0) {
- uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
- __m64 ret;
-
- asm("pshufh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
- );
-
- return ret;
- } else {
- uint64_t val = ((uint64_t)__i1 << 32) |
- ((uint64_t)__i0 << 0);
-
- return *(__m64 *)&val;
- }
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set1_pi8(uint8_t __b0)
-{
- __m64 ret;
-
-  asm("sll $8, %1, 8\n\t"
-      "or $8, $8, %1\n\t"      /* build the doubled byte in $8 so the
-                                  input operand %1 is never written */
-      "mtc1 $8, %0\n\t"
-      "mtc1 $0, $f0\n\t"
-      "pshufh %0, %0, $f0\n\t"
-      : "=f" (ret)
-      : "r" (__b0)
-      : "$8", "$f0"
-      );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set1_pi16(uint16_t __h0)
-{
- __m64 ret;
-
- asm("mtc1 %1, %0\n\t"
- "mtc1 $0, $f0\n\t"
- "pshufh %0, %0, $f0\n\t"
- : "=f" (ret)
- : "r" (__h0)
- : "$8", "$f0"
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_set1_pi32(unsigned __i0)
-{
- return _mm_set_pi32(__i0, __i0);
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
- uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
-{
- return _mm_set_pi8(__h7, __h6, __h5, __h4,
- __h3, __h2, __h1, __h0);
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
-{
- return _mm_set_pi16(__w3, __w2, __w1, __w0);
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
-{
- return _mm_set_pi32(__i1, __i0);
-}
-
-
-/********** Arithmetic Operations **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_add_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_add_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_add_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_add_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_adds_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddsb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_adds_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddsh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_adds_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddusb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_adds_pu16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("paddush %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_avg_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pavgb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_avg_pu16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pavgh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmaddhw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_max_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmaxsh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_max_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmaxub %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_min_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pminsh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_min_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pminub %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline int FUNCTION_ATTRIBS
-_mm_movemask_pi8(__m64 __m1)
-{
- int ret;
-
- asm("pmovmskb %0, %1\n\t"
- : "=r" (ret)
- : "y" (__m1)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmulhh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmulhuh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_mullo_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmullh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_mul_pu32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pmuluw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_sad_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psadbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_asub_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pasubub %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_biadd_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("biadd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_sub_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_sub_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_sub_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_sub_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_subs_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubsb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_subs_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubsh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_subs_pu8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubusb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_subs_pu16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("psubush %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-/********** Logical Operations **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_and_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("and %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_andnot_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("andn %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_or_si32(__m32 __m1, __m32 __m2)
-{
- __m32 ret;
-
- asm("or %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_or_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("or %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_xor_si64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("xor %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-/********** Shift Operations **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_slli_pi16(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psllh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_slli_pi32(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psllw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_slli_si64(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("dsll %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srli_pi16(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psrlh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srli_pi32(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psrlw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srli_si64(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("dsrl %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srai_pi16(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psrah %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srai_pi32(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("psraw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_srai_si64(__m64 __m, int64_t __count)
-{
- __m64 ret;
-
- asm("dsra %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__count)
- );
-
- return ret;
-}
-
-
-/********** Conversion Intrinsics **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-to_m64(uint64_t x)
-{
- return *(__m64 *)&x;
-}
-
-extern __inline uint64_t FUNCTION_ATTRIBS
-to_uint64(__m64 x)
-{
- return *(uint64_t *)&x;
-}
-
-
-/********** Comparison Intrinsics **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpeqb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpeqh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpeqw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpgtb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpgth %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpgtw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpltb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmplth %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("pcmpltw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-/********** Miscellaneous Operations **********/
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("packsshb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("packsswh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("packsswh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("packushb %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_extract_pi16(__m64 __m, int64_t __pos)
-{
- __m64 ret;
-
- asm("pextrh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__pos)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
-{
- __m64 ret;
-
-  switch (__pos) {
-  case 0:
-    asm("pinsrh_0 %0, %1, %2\n\t"
-        : "=f" (ret)
-        : "f" (__m1), "f" (__m2), "i" (__pos)
-        );
-    break;
-  case 1:
-    asm("pinsrh_1 %0, %1, %2\n\t"
-        : "=f" (ret)
-        : "f" (__m1), "f" (__m2), "i" (__pos)
-        );
-    break;
-  case 2:
-    asm("pinsrh_2 %0, %1, %2\n\t"
-        : "=f" (ret)
-        : "f" (__m1), "f" (__m2), "i" (__pos)
-        );
-    break;
-  case 3:
-    asm("pinsrh_3 %0, %1, %2\n\t"
-        : "=f" (ret)
-        : "f" (__m1), "f" (__m2), "i" (__pos)
-        );
-    break;
-  }
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_shuffle_pi16(__m64 __m, int64_t __n)
-{
- __m64 ret;
-
- asm("pshufh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m), "f" (*(__m64 *)&__n)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpckhbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpckhbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpckhhw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpckhhw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpckhwd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
- which preserves the data. */
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
- datatype, which allows load8888 to use 32-bit loads. */
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklbh %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklhw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklhw %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklwd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
-{
- __m64 ret;
-
- asm("punpcklwd %0, %1, %2\n\t"
- : "=f" (ret)
- : "f" (__m1), "f" (__m2)
- );
-
- return ret;
-}
-
-extern __inline void FUNCTION_ATTRIBS
-_mm_store_pi32(__m32 *dest, __m64 src)
-{
- src = _mm_packs_pu16(src, _mm_setzero_si64());
-
- asm("swc1 %1, %0\n\t"
- : "=m" (*dest)
- : "f" (src)
- : "memory"
- );
-}
-
-extern __inline void FUNCTION_ATTRIBS
-_mm_store_si64(__m64 *dest, __m64 src)
-{
- asm("gssdlc1 %1, 7+%0\n\t"
- "gssdrc1 %1, %0\n\t"
- : "=m" (*dest)
- : "f" (src)
- : "memory"
- );
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_load_si32(const __m32 *src)
-{
- __m32 ret;
-
- asm("lwc1 %0, %1\n\t"
- : "=f" (ret)
- : "m" (*src)
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_load_si64(const __m64 *src)
-{
- __m64 ret;
-
- asm("ldc1 %0, %1\n\t"
- : "=f" (ret)
- : "m" (*src)
- : "memory"
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadu_si64(const __m64 *src)
-{
- __m64 ret;
-
- asm("gsldlc1 %0, 7(%1)\n\t"
- "gsldrc1 %0, 0(%1)\n\t"
- : "=f" (ret)
- : "r" (src)
- : "memory"
- );
-
- return ret;
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadlo_pi8(const uint32_t *src)
-{
- return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
-}
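-/* A minimal usage sketch (hypothetical helper, not part of the original
-   header): on this little-endian target, widening one packed 32-bit pixel,
-   e.g. 0xAABBCCDD, yields the 16-bit lanes |0x00AA|0x00BB|0x00CC|0x00DD|. */
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_example_widen8888(const uint32_t *pixel)
-{
- return _mm_loadlo_pi8(pixel); /* punpcklbh with a zero second operand */
-}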
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadlo_pi8_f(__m64 src)
-{
- return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadhi_pi8_f(__m64 src)
-{
- return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadlo_pi16(__m64 src)
-{
- return _mm_unpacklo_pi16(src, _mm_setzero_si64());
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadlo_pi16_f(__m64 src)
-{
- return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadhi_pi16(__m64 src)
-{
- return _mm_unpackhi_pi16(src, _mm_setzero_si64());
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_loadhi_pi16_f(__m64 src)
-{
- return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_expand_alpha(__m64 pixel)
-{
- return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
-}
-
-extern __inline __m64 FUNCTION_ATTRIBS
-_mm_expand_alpha_rev(__m64 pixel)
-{
- return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
-}
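-/* _MM_SHUFFLE(z, y, x, w) encodes a lane selector as
-   (z << 6) | (y << 4) | (x << 2) | w, so _MM_SHUFFLE(3, 3, 3, 3) == 0xFF
-   replicates the top 16-bit lane (the alpha of an expanded pixel) into all
-   four lanes, and _MM_SHUFFLE(0, 0, 0, 0) == 0x00 replicates the bottom
-   lane. */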
-
-#endif /* __LOONGSON_MMINTRIN_H__ */
diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c
deleted file mode 100644
index 454cc99..0000000
--- a/simd/mips/jsimd.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- * jsimd_mips.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * MIPS architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
-#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__)
-
-LOCAL(int)
-parse_proc_cpuinfo(const char *search_string)
-{
- const char *file_name = "/proc/cpuinfo";
- char cpuinfo_line[256];
- FILE *f = NULL;
-
- simd_support = 0;
-
- if ((f = fopen(file_name, "r")) != NULL) {
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
- if (strstr(cpuinfo_line, search_string) != NULL) {
- fclose(f);
- simd_support |= JSIMD_DSPR2;
- return 1;
- }
- }
- fclose(f);
- }
- /* The search string was not found in /proc/cpuinfo, or the file could not
- be opened. */
- return 0;
-}
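-/* For illustration: on a 74K-class core, /proc/cpuinfo typically contains a
-   line such as "cpu model : MIPS 74Kc V4.12", which the substring search
-   above matches with search_string = "MIPS 74K". */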
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd(void)
-{
-#ifndef NO_GETENV
- char *env = NULL;
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
- simd_support |= JSIMD_DSPR2;
-#elif defined(__linux__)
- /* Even if -mdspr2 was not passed to gcc globally, we may still be able to
- * use MIPS DSPr2 by performing runtime detection via /proc/cpuinfo
- * parsing on Linux. */
- if (!parse_proc_cpuinfo("MIPS 74K"))
- return;
-#endif
-
-#ifndef NO_GETENV
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEDSPR2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_DSPR2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-#endif
-}
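-/* Usage sketch for the overrides above (djpeg is used purely as an example
- * command):
- *
- *   JSIMD_FORCEDSPR2=1 djpeg image.jpg   (force the DSPr2 code paths)
- *   JSIMD_FORCENONE=1  djpeg image.jpg   (force the plain C code paths)
- */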
-
-static const int mips_idct_ifast_coefs[4] = {
- 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */
- 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */
- 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */
- 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
-};
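-/* Worked example for the table above, with FIX(x) = round(x * 2^15):
-   FIX(1.082392200 / 2) = round(0.541196100 * 32768) = 17734 = 0x4546. Each
-   16-bit value is stored in both halves of a 32-bit word for the paired
-   DSPr2 multiplies, with the low bits cleared (0x4546 -> 0x4540) in the
-   constants actually shipped. */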
-
-/* The following struct is borrowed from jdsample.c */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr);
-typedef struct {
- struct jpeg_upsampler pub;
- JSAMPARRAY color_buf[MAX_COMPONENTS];
- upsample1_ptr methods[MAX_COMPONENTS];
- int next_row_out;
- JDIMENSION rows_to_go;
- int rowgroup_height[MAX_COMPONENTS];
- UINT8 h_expand[MAX_COMPONENTS];
- UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
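-/* This layout must stay in sync with the private my_upsampler struct in
-   jdsample.c: jsimd_int_upsample() below casts cinfo->upsample to this type
-   to reach the h_expand[] and v_expand[] factors, which are not exposed
-   through any installed header. */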
-
-GLOBAL(int)
-jsimd_can_rgb_ycc(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
- break;
- case JCS_EXT_BGR:
- dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
- break;
- default:
- dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
- break;
- }
-
- dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- dspr2fct = jsimd_extrgb_gray_convert_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
- break;
- case JCS_EXT_BGR:
- dspr2fct = jsimd_extbgr_gray_convert_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
- break;
- default:
- dspr2fct = jsimd_extrgb_gray_convert_dspr2;
- break;
- }
-
- dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
- void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
- break;
- case JCS_EXT_BGR:
- dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
- break;
- default:
- dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
- break;
- }
-
- dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
- output_row, num_rows, cinfo->num_components);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (DCTSIZE != 8)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
- jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
- compptr->v_samp_factor,
- cinfo->max_v_samp_factor,
- cinfo->smoothing_factor,
- compptr->width_in_blocks,
- cinfo->image_width);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor, compptr->width_in_blocks,
- input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
-
- jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
- upsample->v_expand[compptr->component_index],
- input_data, output_data_ptr, cinfo->output_width,
- cinfo->max_v_samp_factor);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
- void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
- break;
- case JCS_EXT_BGR:
- dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
- break;
- default:
- dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
- break;
- }
-
- dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
- void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
- break;
- case JCS_EXT_BGR:
- dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
- break;
- default:
- dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
- break;
- }
-
- dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
- cinfo->sample_range_limit);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
-#ifndef __mips_soft_float
- if (simd_support & JSIMD_DSPR2)
- return 1;
-#endif
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-#ifndef __mips_soft_float
- jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
-#endif
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow(DCTELEM *data)
-{
- jsimd_fdct_islow_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast(DCTELEM *data)
-{
- jsimd_fdct_ifast_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float(FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
-#ifndef __mips_soft_float
- if (simd_support & JSIMD_DSPR2)
- return 1;
-#endif
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
-{
- jsimd_quantize_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-#ifndef __mips_soft_float
- jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
-#endif
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12(void)
-{
- init_simd();
-
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- int workspace[DCTSIZE * 4]; /* buffers data between passes */
-
- jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
- workspace);
-}
-
-GLOBAL(void)
-jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- int workspace[96];
- int output[12] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col),
- (int)(output_buf[8] + output_col),
- (int)(output_buf[9] + output_col),
- (int)(output_buf[10] + output_col),
- (int)(output_buf[11] + output_col)
- };
-
- jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
- jsimd_idct_12x12_pass2_dspr2(workspace, output);
-}
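-/* Note on the output[] arrays here and in jsimd_idct_islow() below: the
-   DSPr2 pass-2 routines take the output row addresses as 32-bit integers,
-   so the JSAMPROW pointers are narrowed with (int) casts. This relies on
-   pointers being 32 bits wide, as they are on the MIPS32 targets this file
-   supports. */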
-
-GLOBAL(int)
-jsimd_can_idct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(ISLOW_MULT_TYPE) != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(IFAST_MULT_TYPE) != 2)
- return 0;
- if (IFAST_SCALE_BITS != 2)
- return 0;
-
- if (simd_support & JSIMD_DSPR2)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- int output[8] = {
- (int)(output_buf[0] + output_col),
- (int)(output_buf[1] + output_col),
- (int)(output_buf[2] + output_col),
- (int)(output_buf[3] + output_col),
- (int)(output_buf[4] + output_col),
- (int)(output_buf[5] + output_col),
- (int)(output_buf[6] + output_col),
- (int)(output_buf[7] + output_col)
- };
-
- jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
- IDCT_range_limit(cinfo));
-}
-
-GLOBAL(void)
-jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- JCOEFPTR inptr;
- IFAST_MULT_TYPE *quantptr;
- DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */
-
- /* Pass 1: process columns from input, store into work array. */
-
- inptr = coef_block;
- quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
-
- jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
- mips_idct_ifast_coefs);
-
- /* Pass 2: process rows from work array, store into output array. */
- /* Note that we must descale the results by a factor of 8 == 2**3, */
- /* and also undo the PASS1_BITS scaling. */
-
- jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
- mips_idct_ifast_coefs);
-}
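-/* For reference, the pass-2 descaling combines the factor of 8 (= 2^3)
-   inherent in the 8x8 inverse DCT with the PASS1_BITS (= 2 in jdct.h for
-   8-bit samples) scaling left over from pass 1, i.e. a net right shift of
-   3 + PASS1_BITS = 5 before the results are range-limited and stored. */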
-
-GLOBAL(void)
-jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block(void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET *)
-jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_first_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
-{
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_refine_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
-{
- return 0;
-}
diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S
deleted file mode 100644
index a28c116..0000000
--- a/simd/mips/jsimd_dspr2.S
+++ /dev/null
@@ -1,4479 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
- * Darko Laus <darko.laus@imgtec.com>
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#include "jsimd_dspr2_asm.h"
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_c_null_convert_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = input_buf
- * a2 = output_buf
- * a3 = output_row
- * 16(sp) = num_rows
- * 20(sp) = cinfo->num_components
- *
- * Null conversion for compression
- */
- SAVE_REGS_ON_STACK 8, s0, s1
-
- lw t9, 24(sp) // t9 = num_rows
- lw s0, 28(sp) // s0 = cinfo->num_components
- andi t0, a0, 3 // t0 = cinfo->image_width & 3
- beqz t0, 4f // no residual
- nop
-0:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-1:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-2:
- lbu t3, 0(t2)
- addiu t5, t5, 1
- sb t3, -1(t5)
- bne t6, t5, 2b
- addu t2, t2, s0
-3:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 3b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 1b
- nop
- addiu a1, a1, 4
- bgez t9, 0b
- addiu a3, a3, 1
- b 7f
- nop
-4:
- addiu t9, t9, -1
- bltz t9, 7f
- li t1, 0
-5:
- sll t3, t1, 2
- lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
- lw t2, 0(a1) // t2 = inptr = *input_buf
- sll t4, a3, 2
- lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
- addu t2, t2, t1
- addu s1, t5, a0
- addu t6, t5, t0
-6:
- lbu t3, 0(t2)
- addu t4, t2, s0
- addu t7, t4, s0
- addu t8, t7, s0
- addu t2, t8, s0
- lbu t4, 0(t4)
- lbu t7, 0(t7)
- lbu t8, 0(t8)
- addiu t5, t5, 4
- sb t3, -4(t5)
- sb t4, -3(t5)
- sb t7, -2(t5)
- bne s1, t5, 6b
- sb t8, -1(t5)
- addiu t1, t1, 1
- bne t1, s0, 5b
- nop
- addiu a1, a1, 4
- bgez t9, 4b
- addiu a3, a3, 1
-7:
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-
-END(jsimd_c_null_convert_dspr2)
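-/* A note for readers new to MIPS assembly: every branch above (b, bne, bgez,
-   ...) has a delay slot -- the instruction immediately following the branch
-   is executed before the branch takes effect. That is why pointer and
-   counter updates such as "addu t2, t2, s0" appear after the branches that
-   close their loops, throughout this file. */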
-
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_ycc_convert_dspr2
- * jsimd_extbgr_ycc_convert_dspr2
- * jsimd_extrgbx_ycc_convert_dspr2
- * jsimd_extbgrx_ycc_convert_dspr2
- * jsimd_extxbgr_ycc_convert_dspr2
- * jsimd_extxrgb_ycc_convert_dspr2
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
- r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_YCC r, g, b, inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = input_buf
- * a2 = output_buf
- * a3 = output_row
- * 16(sp) = num_rows
- */
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw t7, 48(sp) // t7 = num_rows
- li s0, 0x4c8b // FIX(0.29900)
- li s1, 0x9646 // FIX(0.58700)
- li s2, 0x1d2f // FIX(0.11400)
- li s3, 0xffffd4cd // -FIX(0.16874)
- li s4, 0xffffab33 // -FIX(0.33126)
- li s5, 0x8000 // FIX(0.50000)
- li s6, 0xffff94d1 // -FIX(0.41869)
- li s7, 0xffffeb2f // -FIX(0.08131)
- li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
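- // The constants above implement Y = 0.299 R + 0.587 G + 0.114 B (and the
- // matching Cb/Cr rows) in Q16 fixed point: for example FIX(0.29900) =
- // round(0.29900 * 65536) = 19595 = 0x4c8b. t8 folds the chroma offset
- // into the rounding constant: (128 << 16) + (1 << 15) - 1 = 0x807fff.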
-
-0:
- addiu t7, -1 // --num_rows
- lw t6, 0(a1) // t6 = input_buf[0]
- lw t0, 0(a2)
- lw t1, 4(a2)
- lw t2, 8(a2)
- sll t3, a3, 2
- lwx t0, t3(t0) // t0 = output_buf[0][output_row]
- lwx t1, t3(t1) // t1 = output_buf[1][output_row]
- lwx t2, t3(t2) // t2 = output_buf[2][output_row]
-
- addu t9, t2, a0 // t9 = end address
- addiu a3, 1
-
-1:
- DO_RGB_TO_YCC t3, t4, t5, t6
-
- mtlo s5, $ac0
- mtlo t8, $ac1
- mtlo t8, $ac2
- maddu $ac0, s2, t5
- maddu $ac1, s5, t5
- maddu $ac2, s5, t3
- maddu $ac0, s0, t3
- maddu $ac1, s3, t3
- maddu $ac2, s6, t4
- maddu $ac0, s1, t4
- maddu $ac1, s4, t4
- maddu $ac2, s7, t5
- extr.w t3, $ac0, 16
- extr.w t4, $ac1, 16
- extr.w t5, $ac2, 16
- sb t3, 0(t0)
- sb t4, 0(t1)
- sb t5, 0(t2)
- addiu t0, 1
- addiu t2, 1
- bne t2, t9, 1b
- addiu t1, 1
- bgtz t7, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_ycc_convert_dspr2)
-
-.purgem DO_RGB_TO_YCC
-
-.endm
-
-/*-------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
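-/* Each line above stamps out a complete leaf routine from the macro, with
-   the pixel stride and R/G/B byte offsets baked in at assembly time; e.g.
-   "extbgr, 3, 2, 1, 0" generates jsimd_extbgr_ycc_convert_dspr2 for 3-byte
-   pixels with R at byte offset 2, G at 1 and B at 0. */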
-
-
-/*****************************************************************************/
-/*
- * jsimd_ycc_extrgb_convert_dspr2
- * jsimd_ycc_extbgr_convert_dspr2
- * jsimd_ycc_extrgbx_convert_dspr2
- * jsimd_ycc_extbgrx_convert_dspr2
- * jsimd_ycc_extxbgr_convert_dspr2
- * jsimd_ycc_extxrgb_convert_dspr2
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
- r_offs, g_offs, b_offs, a_offs
-
-.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
- sb \scratch0, \r_offs(\outptr)
- sb \scratch1, \g_offs(\outptr)
- sb \scratch2, \b_offs(\outptr)
-.if (\pixel_size == 4)
- li t0, 0xFF
- sb t0, \a_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = input_buf
- * a2 = input_row
- * a3 = output_buf
- * 16(sp) = num_rows
- */
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s1, 48(sp)
- li t3, 0x8000
- li t4, 0x166e9 // FIX(1.40200)
- li t5, 0x1c5a2 // FIX(1.77200)
- li t6, 0xffff492e // -FIX(0.71414)
- li t7, 0xffffa7e6 // -FIX(0.34414)
- repl.ph t8, 128
-
-0:
- lw s0, 0(a3)
- lw t0, 0(a1)
- lw t1, 4(a1)
- lw t2, 8(a1)
- sll s5, a2, 2
- addiu s1, -1
- lwx s2, s5(t0)
- lwx s3, s5(t1)
- lwx s4, s5(t2)
- addu t9, s2, a0
- addiu a2, 1
-
-1:
- lbu s7, 0(s4) // cr
- lbu s6, 0(s3) // cb
- lbu s5, 0(s2) // y
- addiu s2, 1
- addiu s4, 1
- addiu s7, -128
- addiu s6, -128
- mul t2, t7, s6
- mul t0, t6, s7 // Crgtab[cr]
- sll s7, 15
- mulq_rs.w t1, t4, s7 // Crrtab[cr]
- sll s6, 15
- addu t2, t3 // Cbgtab[cb]
- addu t2, t0
-
- mulq_rs.w t0, t5, s6 // Cbbtab[cb]
- sra t2, 16
- addu t1, s5
- addu t2, s5 // add y
- ins t2, t1, 16, 16
- subu.ph t2, t2, t8
- addu t0, s5
- shll_s.ph t2, t2, 8
- subu t0, 128
- shra.ph t2, t2, 8
- shll_s.w t0, t0, 24
- addu.ph t2, t2, t8 // clip & store
- sra t0, t0, 24
- sra t1, t2, 16
- addiu t0, 128
-
- STORE_YCC_TO_RGB t1, t2, t0, s0
-
- bne s2, t9, 1b
- addiu s3, 1
- bgtz s1, 0b
- addiu a3, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_ycc_\colorid\()_convert_dspr2)
-
-.purgem STORE_YCC_TO_RGB
-
-.endm
-
-/*-------------------------------------id -- pix R G B A */
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
-GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
-
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_gray_convert_dspr2
- * jsimd_extbgr_gray_convert_dspr2
- * jsimd_extrgbx_gray_convert_dspr2
- * jsimd_extbgrx_gray_convert_dspr2
- * jsimd_extxbgr_gray_convert_dspr2
- * jsimd_extxrgb_gray_convert_dspr2
- *
- * Colorspace conversion RGB -> GRAY
- */
-
-.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
- r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_GRAY r, g, b, inptr
- lbu \r, \r_offs(\inptr)
- lbu \g, \g_offs(\inptr)
- lbu \b, \b_offs(\inptr)
- addiu \inptr, \pixel_size
-.endm
-
-LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = input_buf
- * a2 = output_buf
- * a3 = output_row
- * 16(sp) = num_rows
- */
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- li s0, 0x4c8b // s0 = FIX(0.29900)
- li s1, 0x9646 // s1 = FIX(0.58700)
- li s2, 0x1d2f // s2 = FIX(0.11400)
- li s7, 0x8000 // s7 = FIX(0.50000)
- lw s6, 48(sp)
- andi t7, a0, 3
-
-0:
- addiu s6, -1 // s6 = num_rows
- lw t0, 0(a1)
- lw t1, 0(a2)
- sll t3, a3, 2
- lwx t1, t3(t1)
- addiu a3, 1
- addu t9, t1, a0
- subu t8, t9, t7
- beq t1, t8, 2f
- nop
-
-1:
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t6, $ac0, 16
-
- DO_RGB_TO_GRAY t3, t4, t5, t0
- DO_RGB_TO_GRAY s3, s4, s5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- extr.w t2, $ac1, 16
- maddu $ac0, s0, t3
- mtlo s7, $ac1
- maddu $ac1, s2, s5
- maddu $ac1, s1, s4
- maddu $ac1, s0, s3
- extr.w t5, $ac0, 16
- sb t6, 0(t1)
- sb t2, 1(t1)
- extr.w t3, $ac1, 16
- addiu t1, 4
- sb t5, -2(t1)
- sb t3, -1(t1)
- bne t1, t8, 1b
- nop
-
-2:
- beqz t7, 4f
- nop
-
-3:
- DO_RGB_TO_GRAY t3, t4, t5, t0
-
- mtlo s7, $ac0
- maddu $ac0, s2, t5
- maddu $ac0, s1, t4
- maddu $ac0, s0, t3
- extr.w t6, $ac0, 16
- sb t6, 0(t1)
- addiu t1, 1
- bne t1, t9, 3b
- nop
-
-4:
- bgtz s6, 0b
- addiu a1, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_\colorid\()_gray_convert_dspr2)
-
-.purgem DO_RGB_TO_GRAY
-
-.endm
-
-/*-------------------------------------id -- pix R G B */
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
-
-
-/*****************************************************************************/
-/*
- * jsimd_h2v2_merged_upsample_dspr2
- * jsimd_h2v2_extrgb_merged_upsample_dspr2
- * jsimd_h2v2_extrgbx_merged_upsample_dspr2
- * jsimd_h2v2_extbgr_merged_upsample_dspr2
- * jsimd_h2v2_extbgrx_merged_upsample_dspr2
- * jsimd_h2v2_extxbgr_merged_upsample_dspr2
- * jsimd_h2v2_extxrgb_merged_upsample_dspr2
- *
- * Merged h2v2 upsample routines
- */
-.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
- r1_offs, g1_offs, \
- b1_offs, a1_offs, \
- r2_offs, g2_offs, \
- b2_offs, a2_offs
-
-.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
- scratch5 outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li \scratch0, 0xFF
- sb \scratch0, \a1_offs(\outptr)
- sb \scratch0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
-/*
- * a0 = cinfo->output_width
- * a1 = input_buf
- * a2 = in_row_group_ctr
- * a3 = output_buf
- * 16(sp) = cinfo->sample_range_limit
- */
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- lw t9, 56(sp) // cinfo->sample_range_limit
- lw v0, 0(a1)
- lw v1, 4(a1)
- lw t0, 8(a1)
- sll t1, a2, 3
- addiu t2, t1, 4
- sll t3, a2, 2
- lw t4, 0(a3) // t4 = output_buf[0]
- lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
- lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
- lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
- lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
- lw t7, 4(a3) // t7 = output_buf[1]
- li s1, 0xe6ea
- addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
- addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
- addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
- xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
- srl t3, a0, 1
- blez t3, 2f
- addu t0, t5, t3 // t0 = end address
- 1:
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t5, t5, 1
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- extr_r.w s5, $ac1, 16
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- lbu v0, 0(t1)
- addiu t6, t6, 1
- addiu t1, t1, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, -1(t1)
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu AT, 0(t3)
- lbu s7, 0(s3)
- lbu ra, 0(v1)
- lbu v0, 1(t2)
- addiu t2, t2, 2
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
-
- bne t0, t5, 1b
- nop
-2:
- andi t0, a0, 1
- beqz t0, 4f
- lbu t3, 0(t5)
- lbu s3, 0(t6)
- addiu t3, t3, -128 // (cb - 128)
- addiu s3, s3, -128 // (cr - 128)
- mult $ac1, s1, t3
- madd $ac1, s2, s3
- sll s3, s3, 15
- sll t3, t3, 15
- lbu v0, 0(t1)
- extr_r.w s5, $ac1, 16
- mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
- lbu v0, 0(t2)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t4
-
- addu t3, v0, s4 // y+cred
- addu s3, v0, s5 // y+cgreen
- addu v1, v0, s6 // y+cblue
- addu t3, t9, t3 // y+cred
- addu s3, t9, s3 // y+cgreen
- addu v1, t9, v1 // y+cblue
- lbu t3, 0(t3)
- lbu s3, 0(s3)
- lbu v1, 0(v1)
-
- STORE_H2V2_1_PIXEL t3, s3, v1, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
-
-.purgem STORE_H2V2_1_PIXEL
-.purgem STORE_H2V2_2_PIXELS
-.endm
-
-/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
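-/* In this table the "pix" column is twice the bytes-per-pixel value,
-   because STORE_H2V2_2_PIXELS emits two horizontally upsampled pixels per
-   step: 6 for the 3-byte formats and 8 for the 4-byte ones, which is also
-   why the alpha stores above are guarded with ".if (\pixel_size == 8)". */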
-
-
-/*****************************************************************************/
-/*
- * jsimd_h2v1_merged_upsample_dspr2
- * jsimd_h2v1_extrgb_merged_upsample_dspr2
- * jsimd_h2v1_extrgbx_merged_upsample_dspr2
- * jsimd_h2v1_extbgr_merged_upsample_dspr2
- * jsimd_h2v1_extbgrx_merged_upsample_dspr2
- * jsimd_h2v1_extxbgr_merged_upsample_dspr2
- * jsimd_h2v1_extxrgb_merged_upsample_dspr2
- *
- * Merged h2v1 upsample routines
- */
-
-.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
- r1_offs, g1_offs, \
- b1_offs, a1_offs, \
- r2_offs, g2_offs, \
- b2_offs, a2_offs
-
-.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
- scratch5 outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
- sb \scratch3, \r2_offs(\outptr)
- sb \scratch4, \g2_offs(\outptr)
- sb \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
- sb t0, \a2_offs(\outptr)
-.endif
- addiu \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
- sb \scratch0, \r1_offs(\outptr)
- sb \scratch1, \g1_offs(\outptr)
- sb \scratch2, \b1_offs(\outptr)
-.if (\pixel_size == 8)
- li t0, 0xFF
- sb t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
-/*
- * a0 = cinfo->output_width
- * a1 = input_buf
- * a2 = in_row_group_ctr
- * a3 = output_buf
- * 16(sp) = range_limit
- */
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- li t0, 0xe6ea
- lw t1, 0(a1) // t1 = input_buf[0]
- lw t2, 4(a1) // t2 = input_buf[1]
- lw t3, 8(a1) // t3 = input_buf[2]
- lw t8, 56(sp) // t8 = range_limit
- addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
- addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
- addiu s0, t0, 0x9916 // s0 = 0x8000
- addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
- xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
- srl t0, a0, 1
- sll t4, a2, 2
- lwx s5, t4(t1) // s5 = inptr0
- lwx s6, t4(t2) // s6 = inptr1
- lwx s7, t4(t3) // s7 = inptr2
- lw t7, 0(a3) // t7 = outptr
- blez t0, 2f
- addu t9, s6, t0 // t9 = end address
-1:
- lbu t2, 0(s6) // t2 = cb
- lbu t0, 0(s7) // t0 = cr
- lbu t1, 0(s5) // t1 = y
- addiu t2, t2, -128 // t2 = cb - 128
- addiu t0, t0, -128 // t0 = cr - 128
- mult $ac1, s4, t2
- madd $ac1, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
- extr_r.w t5, $ac1, 16
- mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
- addiu s7, s7, 1
- addiu s6, s6, 1
- addu t2, t1, t0 // t2 = y + cred
- addu t3, t1, t5 // t3 = y + cgreen
- addu t4, t1, t6 // t4 = y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t1, 1(s5)
- lbu v0, 0(t2)
- lbu v1, 0(t3)
- lbu ra, 0(t4)
- addu t2, t1, t0
- addu t3, t1, t5
- addu t4, t1, t6
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
-
- bne t9, s6, 1b
- addiu s5, s5, 2
-2:
- andi t0, a0, 1
- beqz t0, 4f
- nop
-3:
- lbu t2, 0(s6)
- lbu t0, 0(s7)
- lbu t1, 0(s5)
- addiu t2, t2, -128 // (cb - 128)
- addiu t0, t0, -128 // (cr - 128)
- mul t3, s4, t2
- mul t4, s3, t0
- sll t0, t0, 15
- sll t2, t2, 15
- mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
- mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
- addu t3, t3, s0
- addu t3, t4, t3
- sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
- addu t2, t1, t0 // y + cred
- addu t3, t1, t5 // y + cgreen
- addu t4, t1, t6 // y + cblue
- addu t2, t8, t2
- addu t3, t8, t3
- addu t4, t8, t4
- lbu t2, 0(t2)
- lbu t3, 0(t3)
- lbu t4, 0(t4)
-
- STORE_H2V1_1_PIXEL t2, t3, t4, t7
-4:
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
- j ra
- nop
-
-END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
-
-.purgem STORE_H2V1_1_PIXEL
-.purgem STORE_H2V1_2_PIXELS
-.endm
-
-/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-
-
-/*****************************************************************************/
-/*
- * jsimd_h2v2_fancy_upsample_dspr2
- *
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- */
-LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
-/*
- * a0 = cinfo->max_v_samp_factor
- * a1 = downsampled_width
- * a2 = input_data
- * a3 = output_data_ptr
- */
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- li s4, 0
- lw s2, 0(a3) // s2 = *output_data_ptr
-0:
- li t9, 2
- lw s1, -4(a2) // s1 = inptr1
-
-1:
- lw s0, 0(a2) // s0 = inptr0
- lwx s3, s4(s2)
- addiu s5, a1, -2 // s5 = downsampled_width - 2
- srl t4, s5, 1
- sll t4, t4, 1
- lbu t0, 0(s0)
- lbu t1, 1(s0)
- lbu t2, 0(s1)
- lbu t3, 1(s1)
- addiu s0, 2
- addiu s1, 2
- addu t8, s0, t4 // t8 = end address
- andi s5, s5, 1 // s5 = residual
- sll t4, t0, 1
- sll t6, t1, 1
- addu t0, t0, t4 // t0 = (*inptr0++) * 3
- addu t1, t1, t6 // t1 = (*inptr0++) * 3
- addu t7, t0, t2 // t7 = thiscolsum
- addu t6, t1, t3 // t6 = nextcolsum
- sll t0, t7, 2 // t0 = thiscolsum * 4
- subu t1, t0, t7 // t1 = thiscolsum * 3
- shra_r.w t0, t0, 4
- addiu t1, 7
- addu t1, t1, t6
- srl t1, t1, 4
- sb t0, 0(s3)
- sb t1, 1(s3)
- beq t8, s0, 22f // skip to final iteration if width == 3
- addiu s3, 2
-2:
- lh t0, 0(s0) // t0 = A3|A2
- lh t2, 0(s1) // t2 = B3|B2
- addiu s0, 2
- addiu s1, 2
- preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
- preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
- shll.ph t1, t0, 1
- sll t3, t6, 1
- addu.ph t0, t1, t0 // t0 = A3*3|A2*3
- addu t3, t3, t6 // t3 = this * 3
- addu.ph t0, t0, t2 // t0 = next2|next1
- addu t1, t3, t7
- andi t7, t0, 0xFFFF // t7 = next1
- sll t2, t7, 1
- addu t2, t7, t2 // t2 = next1*3
- addu t4, t2, t6
- srl t6, t0, 16 // t6 = next2
- shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
- addu t0, t3, t7
- addiu t0, 7
- srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
- shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
- addu t2, t2, t6
- addiu t2, 7
- srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- sb t4, 2(s3)
- sb t2, 3(s3)
- bne t8, s0, 2b
- addiu s3, 4
-22:
- beqz s5, 4f
- addu t8, s0, s5
-3:
- lbu t0, 0(s0)
- lbu t2, 0(s1)
- addiu s0, 1
- addiu s1, 1
- sll t3, t6, 1
- sll t1, t0, 1
- addu t1, t0, t1 // t1 = inptr0 * 3
- addu t3, t3, t6 // t3 = thiscolsum * 3
- addu t5, t1, t2
- addu t1, t3, t7
- shra_r.w t1, t1, 4
- addu t0, t3, t5
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu s3, 2
- move t7, t6
- bne t8, s0, 3b
- move t6, t5
-4:
- sll t0, t6, 2 // t0 = thiscolsum * 4
- subu t1, t0, t6 // t1 = thiscolsum * 3
- addu t1, t1, t7
- addiu s4, 4
- shra_r.w t1, t1, 4
- addiu t0, 7
- srl t0, t0, 4
- sb t1, 0(s3)
- sb t0, 1(s3)
- addiu t9, -1
- addiu s3, 2
- bnez t9, 1b
- lw s1, 4(a2)
- srl t0, s4, 2
- subu t0, a0, t0
- bgtz t0, 0b
- addiu a2, 4
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-END(jsimd_h2v2_fancy_upsample_dspr2)
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
-/*
- * a0 = cinfo->max_v_samp_factor
- * a1 = downsampled_width
- * a2 = input_data
- * a3 = output_data_ptr
- */
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- .set at
-
- beqz a0, 3f
- sll t0, a0, 2
- lw s1, 0(a3)
- li s3, 0x10001
- addu s0, s1, t0
-0:
- addiu t8, a1, -2
- srl t9, t8, 2
- lw t7, 0(a2)
- lw s2, 0(s1)
- lbu t0, 0(t7)
- lbu t1, 1(t7) // t1 = inptr[1]
- sll t2, t0, 1
- addu t2, t2, t0 // t2 = invalue*3
- addu t2, t2, t1
- shra_r.w t2, t2, 2
- sb t0, 0(s2)
- sb t2, 1(s2)
- beqz t9, 11f
- addiu s2, 2
-1:
- ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
- ulw t1, 1(t7)
- ulh t2, 4(t7) // t2 = |0|0|P5|P4|
- preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
- preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
- preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
- preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
- preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
- shll.ph t5, t4, 1
- shll.ph t6, t1, 1
- addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
- addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
- addu.ph t4, t3, s3
- addu.ph t0, t0, s3
- addu.ph t4, t4, t5
- addu.ph t0, t0, t6
- shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
- shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
- addu.ph t2, t2, t5
- addu.ph t3, t3, t6
- shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
- shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
- shll.ph t2, t2, 8
- shll.ph t3, t3, 8
- or t2, t4, t2
- or t3, t3, t0
- addiu t9, -1
- usw t3, 0(s2)
- usw t2, 4(s2)
- addiu s2, 8
- bgtz t9, 1b
- addiu t7, 4
-11:
- andi t8, 3
- beqz t8, 22f
- addiu t7, 1
-
-2:
- lbu t0, 0(t7)
- addiu t7, 1
- sll t1, t0, 1
- addu t2, t0, t1 // t2 = invalue * 3
- lbu t3, -2(t7)
- lbu t4, 0(t7)
- addiu t3, 1
- addiu t4, 2
- addu t3, t3, t2
- addu t4, t4, t2
- srl t3, 2
- srl t4, 2
- sb t3, 0(s2)
- sb t4, 1(s2)
- addiu t8, -1
- bgtz t8, 2b
- addiu s2, 2
-
-22:
- lbu t0, 0(t7)
- lbu t2, -1(t7)
- sll t1, t0, 1
- addu t1, t1, t0 // t1 = invalue * 3
- addu t1, t1, t2
- addiu t1, 1
- srl t1, t1, 2
- sb t1, 0(s2)
- sb t0, 1(s2)
- addiu s1, 4
- bne s1, s0, 0b
- addiu a2, 4
-3:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_h2v1_fancy_upsample_dspr2)
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = cinfo->max_v_samp_factor
- * a2 = compptr->v_samp_factor
- * a3 = compptr->width_in_blocks
- * 16(sp) = input_data
- * 20(sp) = output_data
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
-
- beqz a2, 7f
- lw s1, 44(sp) // s1 = output_data
- lw s0, 40(sp) // s0 = input_data
- srl s2, a0, 2
- andi t9, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks * DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- li s3, 0 // s3 = bias
- srl t7, a0, 1 // t7 = image_width1
- srl s4, t7, 2
- andi t8, t7, 3
-1:
- ulhu t0, 0(t5)
- ulhu t1, 2(t5)
- ulhu t2, 4(t5)
- ulhu t3, 6(t5)
- raddu.w.qb t0, t0
- raddu.w.qb t1, t1
- raddu.w.qb t2, t2
- raddu.w.qb t3, t3
- shra.ph t0, t0, 1
- shra_r.ph t1, t1, 1
- shra.ph t2, t2, 1
- shra_r.ph t3, t3, 1
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t3, 3(t4)
- addiu s4, -1
- addiu t4, 4
- bgtz s4, 1b
- addiu t5, 8
- beqz t8, 3f
- addu s4, t4, t8
-2:
- ulhu t0, 0(t5)
- raddu.w.qb t0, t0
- addqh.w t0, t0, s3
- xori s3, s3, 1
- sb t0, 0(t4)
- addiu t4, 1
- bne t4, s4, 2b
- addiu t5, 2
-3:
- lbux t1, t6(t5)
- sll t1, 1
- addqh.w t2, t1, s3 // t2 = pixval1
- xori s3, s3, 1
- addqh.w t3, t1, s3 // t3 = pixval2
- blez s2, 5f
- append t3, t2, 8
- addu t5, t4, s2 // t5 = loop_end2
-4:
- ush t3, 0(t4)
- addiu s2, -1
- bgtz s2, 4b
- addiu t4, 2
-5:
- beqz t9, 6f
- nop
- sb t2, 0(t4)
-6:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 4
-7:
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
-
- j ra
- nop
-END(jsimd_h2v1_downsample_dspr2)
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
-/*
- * a0 = cinfo->image_width
- * a1 = cinfo->max_v_samp_factor
- * a2 = compptr->v_samp_factor
- * a3 = compptr->width_in_blocks
- * 16(sp) = input_data
- * 20(sp) = output_data
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- beqz a2, 8f
- lw s1, 52(sp) // s1 = output_data
- lw s0, 48(sp) // s0 = input_data
-
- andi t6, a0, 1 // t6 = temp_index
- addiu t6, -1
- srl t7, a0, 1 // t7 = image_width1
- srl s4, t7, 2
- andi t8, t7, 3
- andi t9, a0, 2
- srl s2, a0, 2
- srl t7, t9, 1
- addu s2, t7, s2
- sll t0, a3, 3 // t0 = width_in_blocks * DCTSIZE
- srl t7, t0, 1
- subu s2, t7, s2
-0:
- lw t4, 0(s1) // t4 = outptr
- lw t5, 0(s0) // t5 = inptr0
- lw s7, 4(s0) // s7 = inptr1
- li s6, 1 // s6 = bias
-2:
- ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
- ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
- ulw t2, 4(t5)
- ulw t3, 4(s7)
- precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
- ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
- raddu.w.qb t1, t7
- raddu.w.qb t0, t0
- shra_r.w t1, t1, 2
- addiu t0, 1
- srl t0, 2
- precrq.ph.w t7, t2, t3
- ins t2, t3, 16, 16
- raddu.w.qb t7, t7
- raddu.w.qb t2, t2
- shra_r.w t7, t7, 2
- addiu t2, 1
- srl t2, 2
- sb t0, 0(t4)
- sb t1, 1(t4)
- sb t2, 2(t4)
- sb t7, 3(t4)
- addiu t4, 4
- addiu t5, 8
- addiu s4, s4, -1
- bgtz s4, 2b
- addiu s7, 8
- beqz t8, 4f
- addu t8, t4, t8
-3:
- ulhu t0, 0(t5)
- ulhu t1, 0(s7)
- ins t0, t1, 16, 16
- raddu.w.qb t0, t0
- addu t0, t0, s6
- srl t0, 2
- xori s6, s6, 3
- sb t0, 0(t4)
- addiu t5, 2
- addiu t4, 1
- bne t8, t4, 3b
- addiu s7, 2
-4:
- lbux t1, t6(t5)
- sll t1, 1
- lbux t0, t6(s7)
- sll t0, 1
- addu t1, t1, t0
- addu t3, t1, s6
- srl t0, t3, 2 // t0 = pixval1
- xori s6, s6, 3
- addu t2, t1, s6
- srl t1, t2, 2 // t1 = pixval2
- blez s2, 6f
- append t1, t0, 8
-5:
- ush t1, 0(t4)
- addiu s2, -1
- bgtz s2, 5b
- addiu t4, 2
-6:
- beqz t9, 7f
- nop
- sb t0, 0(t4)
-7:
- addiu s1, 4
- addiu a2, -1
- bnez a2, 0b
- addiu s0, 8
-8:
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_h2v2_downsample_dspr2)
-
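-/*
- * A rough C equivalent of the 2x2 averaging above (libjpeg's plain h2v2
- * downsample; the bias alternates 1,2 via the xori s6, s6, 3). Names are
- * illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static void h2v2_downsample_row(const unsigned char *in0,
-                                const unsigned char *in1,
-                                unsigned char *out, int out_cols)
-{
-  int bias = 1;                     // alternates 1,2 across the row
-  for (int i = 0; i < out_cols; i++, in0 += 2, in1 += 2) {
-    out[i] = (unsigned char)((in0[0] + in0[1] + in1[0] + in1[1] + bias) >> 2);
-    bias ^= 3;
-  }
-}
-#endif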
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
-/*
- * a0 = input_data
- * a1 = output_data
- * a2 = compptr->v_samp_factor
- * a3 = cinfo->max_v_samp_factor
- * 16(sp) = cinfo->smoothing_factor
- * 20(sp) = compptr->width_in_blocks
- * 24(sp) = cinfo->image_width
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw s7, 52(sp) // compptr->width_in_blocks
- lw s0, 56(sp) // cinfo->image_width
- lw s6, 48(sp) // cinfo->smoothing_factor
- sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
- sll v0, s7, 1
- subu v0, v0, s0
- blez v0, 2f
- move v1, zero
- addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
-0:
- addiu t1, a0, -4
- sll t2, v1, 2
- lwx t1, t2(t1)
- move t3, v0
- addu t1, t1, s0
- lbu t2, -1(t1)
-1:
- addiu t3, t3, -1
- sb t2, 0(t1)
- bgtz t3, 1b
- addiu t1, t1, 1
- addiu v1, v1, 1
- bne v1, t0, 0b
- nop
-2:
- li v0, 80
- mul v0, s6, v0
- li v1, 16384
- move t4, zero
- move t5, zero
- subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80
- sll t7, s6, 4 // t7 = smoothing_factor * 16
-3:
-/* Special case for first column: pretend column -1 is same as column 0 */
- sll v0, t4, 2
- lwx t8, v0(a1) // outptr = output_data[outrow]
- sll v1, t5, 2
- addiu t9, v1, 4
- addiu s0, v1, -4
- addiu s1, v1, 8
- lwx s2, v1(a0) // inptr0 = input_data[inrow]
- lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
- lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
- lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, 0(s2)
- lbu v1, 2(s2)
- lbu t0, 0(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, 0(s0)
- lbu t0, 0(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w v0, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- addiu s1, s1, 2
- sb v0, -1(t8)
- addiu s4, s7, -2
- and s4, s4, 3
- addu s5, s4, t8 // end address
-4:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- addiu t8, t8, 1
- addiu s2, s2, 2
- addiu t9, t9, 2
- addiu s0, s0, 2
- sb t2, -1(t8)
- bne s5, t8, 4b
- addiu s1, s1, 2
- addiu s5, s7, -2
- subu s5, s5, s4
- addu s5, s5, t8 // end address
-5:
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 2(s2)
- lbu t0, -1(t9)
- lbu t1, 2(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 2(s0)
- addu t0, t0, v0
- lbu t3, 2(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- lh v1, 2(t9)
- addu t0, t0, v0
- lh v0, 2(s2)
- addu s3, t0, s3
- lh t0, 2(s0)
- lh t1, 2(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 4(s2)
- lbu t0, 1(t9)
- lbu t1, 4(t9)
- sb t2, 0(t8)
- raddu.w.qb t3, v0
- lbu v0, 1(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 4(s0)
- addu t0, t0, v0
- lbu v0, 1(s0)
- addu s3, t0, s3
- lbu t0, 1(s1)
- lbu t3, 4(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 4(t9)
- addu t0, t0, v0
- lh v0, 4(s2)
- addu s3, t0, s3
- lh t0, 4(s0)
- lh t1, 4(s1)
- madd $ac1, s3, t7
- extr_r.w t2, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 6(s2)
- lbu t0, 3(t9)
- lbu t1, 6(t9)
- sb t2, 1(t8)
- raddu.w.qb t3, v0
- lbu v0, 3(s2)
- addu t0, t0, t1
- mult $ac1, t3, t6
- addu v0, v0, v1
- lbu t2, 6(s0)
- addu t0, t0, v0
- lbu v0, 3(s0)
- addu s3, t0, s3
- lbu t0, 3(s1)
- lbu t3, 6(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- lh v1, 6(t9)
- addu t0, t0, v0
- lh v0, 6(s2)
- addu s3, t0, s3
- lh t0, 6(s0)
- lh t1, 6(s1)
- madd $ac1, s3, t7
- extr_r.w t3, $ac1, 16
- ins t0, t1, 16, 16
- ins v0, v1, 16, 16
- raddu.w.qb s3, t0
- lbu v1, 8(s2)
- lbu t0, 5(t9)
- lbu t1, 8(t9)
- sb t3, 2(t8)
- raddu.w.qb t2, v0
- lbu v0, 5(s2)
- addu t0, t0, t1
- mult $ac1, t2, t6
- addu v0, v0, v1
- lbu t2, 8(s0)
- addu t0, t0, v0
- lbu v0, 5(s0)
- addu s3, t0, s3
- lbu t0, 5(s1)
- lbu t3, 8(s1)
- addu v0, v0, t2
- sll s3, s3, 1
- addu t0, t0, t3
- addiu t8, t8, 4
- addu t0, t0, v0
- addiu s2, s2, 8
- addu s3, t0, s3
- addiu t9, t9, 8
- madd $ac1, s3, t7
- extr_r.w t1, $ac1, 16
- addiu s0, s0, 8
- addiu s1, s1, 8
- bne s5, t8, 5b
- sb t1, -1(t8)
-/* Special case for last column */
- lh v0, 0(s2)
- lh v1, 0(t9)
- lh t0, 0(s0)
- lh t1, 0(s1)
- ins v0, v1, 16, 16
- ins t0, t1, 16, 16
- raddu.w.qb t2, v0
- raddu.w.qb s3, t0
- lbu v0, -1(s2)
- lbu v1, 1(s2)
- lbu t0, -1(t9)
- lbu t1, 1(t9)
- addu v0, v0, v1
- mult $ac1, t2, t6
- addu t0, t0, t1
- lbu t2, 1(s0)
- addu t0, t0, v0
- lbu t3, 1(s1)
- addu s3, t0, s3
- lbu v0, -1(s0)
- lbu t0, -1(s1)
- sll s3, s3, 1
- addu v0, v0, t2
- addu t0, t0, t3
- addu t0, t0, v0
- addu s3, t0, s3
- madd $ac1, s3, t7
- extr_r.w t0, $ac1, 16
- addiu t5, t5, 2
- sb t0, 0(t8)
- addiu t4, t4, 1
- bne t4, a2, 3b
- addiu t5, t5, 2
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_h2v2_smooth_downsample_dspr2)
-
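-/*
- * A hedged C sketch of one output pixel of the smoothing kernel above,
- * assuming libjpeg's h2v2_smooth_downsample semantics (memberscale and
- * neighscale match the t6/t7 values computed at label 2). Illustrative only.
- */
-#if 0 /* reference sketch, not assembled */
-static unsigned char smooth_2x2_pixel(const unsigned char *in0,
-                                      const unsigned char *in1,
-                                      const unsigned char *above,
-                                      const unsigned char *below,
-                                      int smoothing_factor)
-{
-  long memberscale = 16384L - smoothing_factor * 80L;   // t6 above
-  long neighscale = smoothing_factor * 16L;             // t7 above
-  long membersum = in0[0] + in0[1] + in1[0] + in1[1];   // the 2x2 block
-  long neighsum = above[0] + above[1] + below[0] + below[1] +
-                  in0[-1] + in0[2] + in1[-1] + in1[2];  // edge neighbors
-  neighsum += neighsum;                                 // edges count twice
-  neighsum += above[-1] + above[2] + below[-1] + below[2];  // corners once
-  membersum = membersum * memberscale + neighsum * neighscale;
-  return (unsigned char)((membersum + 32768L) >> 16);   // extr_r.w ..., 16
-}
-#endif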
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_int_upsample_dspr2)
-/*
- * a0 = upsample->h_expand[compptr->component_index]
- * a1 = upsample->v_expand[compptr->component_index]
- * a2 = input_data
- * a3 = output_data_ptr
- * 16(sp) = cinfo->output_width
- * 20(sp) = cinfo->max_v_samp_factor
- */
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- lw s0, 0(a3) // s0 = output_data
- lw s1, 32(sp) // s1 = cinfo->output_width
- lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
- li t6, 0 // t6 = inrow
- beqz s2, 10f
- li s3, 0 // s3 = outrow
-0:
- addu t0, a2, t6
- addu t7, s0, s3
- lw t3, 0(t0) // t3 = inptr
- lw t8, 0(t7) // t8 = outptr
- beqz s1, 4f
- addu t5, t8, s1 // t5 = outend
-1:
- lb t2, 0(t3) // t2 = invalue = *inptr++
- addiu t3, 1
- beqz a0, 3f
- move t0, a0 // t0 = h_expand
-2:
- sb t2, 0(t8)
- addiu t0, -1
- bgtz t0, 2b
- addiu t8, 1
-3:
- bgt t5, t8, 1b
- nop
-4:
- addiu t9, a1, -1 // t9 = v_expand - 1
- blez t9, 9f
- nop
-5:
- lw t3, 0(s0)
- lw t4, 4(s0)
- subu t0, s1, 0xF
- blez t0, 7f
- addu t5, t3, s1 // t5 = end address
- andi t7, s1, 0xF // t7 = residual
- subu t8, t5, t7
-6:
- ulw t0, 0(t3)
- ulw t1, 4(t3)
- ulw t2, 8(t3)
- usw t0, 0(t4)
- ulw t0, 12(t3)
- usw t1, 4(t4)
- usw t2, 8(t4)
- usw t0, 12(t4)
- addiu t3, 16
- bne t3, t8, 6b
- addiu t4, 16
- beqz t7, 8f
- nop
-7:
- lbu t0, 0(t3)
- sb t0, 0(t4)
- addiu t3, 1
- bne t3, t5, 7b
- addiu t4, 1
-8:
- addiu t9, -1
- bgtz t9, 5b
- addiu s0, 8
-9:
- addu s3, s3, a1
- bne s3, s2, 0b
- addiu t6, 1
-10:
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-END(jsimd_int_upsample_dspr2)
-
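-/*
- * A rough C equivalent of the horizontal replication above (libjpeg's
- * int_upsample inner loop); the vertical pass then duplicates each finished
- * row v_expand - 1 more times, which is the 16-byte copy loop at label 5.
- * Names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static void int_upsample_row(const unsigned char *in, unsigned char *out,
-                             int out_width, int h_expand)
-{
-  for (int col = 0; col < out_width; ) {
-    unsigned char v = *in++;                    // invalue = *inptr++
-    for (int h = 0; h < h_expand && col < out_width; h++, col++)
-      *out++ = v;                               // replicate h_expand times
-  }
-}
-#endif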
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
-/*
- * a0 = cinfo->max_v_samp_factor
- * a1 = cinfo->output_width
- * a2 = input_data
- * a3 = output_data_ptr
- */
- lw t7, 0(a3) // t7 = output_data
- andi t8, a1, 0xf // t8 = residual
- sll t0, a0, 2
- blez a0, 4f
- addu t9, t7, t0 // t9 = output_data end address
-0:
- lw t5, 0(t7) // t5 = outptr
- lw t6, 0(a2) // t6 = inptr
- addu t3, t5, a1 // t3 = outptr + output_width (end address)
- subu t3, t8 // t3 = end address - residual
- beq t5, t3, 2f
- move t4, t8
-1:
- ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
- ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
- srl t1, t0, 16 // t1 = |X|X|P3|P2|
- ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
- ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
- ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
- ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t0, t2, 16 // t0 = |X|X|P7|P6|
- ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
- ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
- ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
- ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t5, 16
- bne t5, t3, 1b
- addiu t6, 8
- beqz t8, 3f
- move t4, t8
-2:
- lbu t1, 0(t6)
- sb t1, 0(t5)
- sb t1, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- addiu t7, 4
- bne t9, t7, 0b
- addiu a2, 4
-4:
- j ra
- nop
-END(jsimd_h2v1_upsample_dspr2)
-
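-/*
- * A rough C equivalent of the word shuffling above: h2v1 upsampling is pure
- * pixel doubling, which the ins/srl pairs do four pixels at a time. Assumes
- * an even out_width; names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static void h2v1_upsample_row(const unsigned char *in, unsigned char *out,
-                              int out_width)
-{
-  for (int col = 0; col < out_width; col += 2, in++)
-    out[col] = out[col + 1] = *in;    // each input sample appears twice
-}
-#endif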
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
-/*
- * a0 = cinfo->max_v_samp_factor
- * a1 = cinfo->output_width
- * a2 = input_data
- * a3 = output_data_ptr
- */
- lw t7, 0(a3)
- blez a0, 7f
- andi t9, a1, 0xf // t9 = residual
-0:
- lw t6, 0(a2) // t6 = inptr
- lw t5, 0(t7) // t5 = outptr
- addu t8, t5, a1 // t8 = outptr end address
- subu t8, t9 // t8 = end address - residual
- beq t5, t8, 2f
- move t4, t9
-1:
- ulw t0, 0(t6)
- srl t1, t0, 16
- ins t0, t0, 16, 16
- ins t0, t0, 8, 16
- ins t1, t1, 16, 16
- ins t1, t1, 8, 16
- ulw t2, 4(t6)
- usw t0, 0(t5)
- usw t1, 4(t5)
- srl t3, t2, 16
- ins t2, t2, 16, 16
- ins t2, t2, 8, 16
- ins t3, t3, 16, 16
- ins t3, t3, 8, 16
- usw t2, 8(t5)
- usw t3, 12(t5)
- addiu t5, 16
- bne t5, t8, 1b
- addiu t6, 8
- beqz t9, 3f
- move t4, t9
-2:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- sb t0, 1(t5)
- addiu t4, -2
- addiu t6, 1
- bgtz t4, 2b
- addiu t5, 2
-3:
- lw t6, 0(t7) // t6 = outptr[0]
- lw t5, 4(t7) // t5 = outptr[1]
- addu t4, t6, a1 // t4 = new end address
- beq a1, t9, 5f
- subu t8, t4, t9
-4:
- ulw t0, 0(t6)
- ulw t1, 4(t6)
- ulw t2, 8(t6)
- usw t0, 0(t5)
- ulw t0, 12(t6)
- usw t1, 4(t5)
- usw t2, 8(t5)
- usw t0, 12(t5)
- addiu t6, 16
- bne t6, t8, 4b
- addiu t5, 16
- beqz t9, 6f
- nop
-5:
- lbu t0, 0(t6)
- sb t0, 0(t5)
- addiu t6, 1
- bne t6, t4, 5b
- addiu t5, 1
-6:
- addiu t7, 8
- addiu a0, -2
- bgtz a0, 0b
- addiu a2, 4
-7:
- j ra
- nop
-END(jsimd_h2v2_upsample_dspr2)
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_islow_dspr2)
-/*
- * a0 = coef_block
- * a1 = compptr->dcttable
- * a2 = output
- * a3 = range_limit
- */
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -256
- move v0, sp
- addiu v1, zero, 8 // v1 = DCTSIZE = 8
-1:
- lh s4, 32(a0) // s4 = inptr[16]
- lh s5, 64(a0) // s5 = inptr[32]
- lh s6, 96(a0) // s6 = inptr[48]
- lh t1, 112(a0) // t1 = inptr[56]
- lh t7, 16(a0) // t7 = inptr[8]
- lh t5, 80(a0) // t5 = inptr[40]
- lh t3, 48(a0) // t3 = inptr[24]
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, t5
- or s4, s4, t7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 2f
- addiu v1, v1, -1
- lh s5, 0(a1) // quantptr[DCTSIZE*0]
- lh s6, 0(a0) // inptr[DCTSIZE*0]
- mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
- sll s5, s5, 2
- sw s5, 0(v0)
- sw s5, 32(v0)
- sw s5, 64(v0)
- sw s5, 96(v0)
- sw s5, 128(v0)
- sw s5, 160(v0)
- sw s5, 192(v0)
- b 3f
- sw s5, 224(v0)
-2:
- lh t0, 112(a1)
- lh t2, 48(a1)
- lh t4, 80(a1)
- lh t6, 16(a1)
- mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
- mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
- mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
- mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
- lh t4, 32(a1)
- lh t5, 32(a0)
- lh t6, 96(a1)
- lh t7, 96(a0)
- addu s0, t0, t1 // z3 = tmp0 + tmp2
- addu s1, t1, t2 // z2 = tmp1 + tmp2
- addu s2, t2, t3 // z4 = tmp1 + tmp3
- addu s3, s0, s2 // z3 + z4
- addiu t9, zero, 9633 // FIX_1_175875602
- mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu t8, t0, t3 // z1 = tmp0 + tmp3
- addiu t9, zero, 2446 // FIX_0_298631336
- mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t9, zero, 16819 // FIX_2_053119869
- mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t9, zero, 25172 // FIX_3_072711026
- mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t9, zero, 12299 // FIX_1_501321110
- mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- addiu t9, zero, 16069 // FIX_1_961570560
- mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t9, zero, 3196 // FIX_0_390180644
- mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t9, zero, 7373 // FIX_0_899976223
- mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t9, zero, 20995 // FIX_2_562915447
- mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- subu s0, s3, s0 // z3 += z5
- addu t0, t0, s0 // tmp0 += z3
- addu t1, t1, s0 // tmp2 += z3
- subu s2, s3, s2 // z4 += z5
- addu t2, t2, s2 // tmp1 += z4
- addu t3, t3, s2 // tmp3 += z4
- subu t0, t0, t8 // tmp0 += z1
- subu t1, t1, s1 // tmp2 += z2
- subu t2, t2, s1 // tmp1 += z2
- subu t3, t3, t8 // tmp3 += z1
- mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
- addiu t9, zero, 6270 // FIX_0_765366865
- mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
- lh t4, 0(a1)
- lh t5, 0(a0)
- lh t6, 64(a1)
- lh t7, 64(a0)
- mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
- mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
- mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
- addiu t9, zero, 4433 // FIX_0_541196100
- addu s3, s0, s1 // z2 + z3
- mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t9, zero, 15137 // FIX_1_847759065
- mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
- addu t4, t5, t6
- subu t5, t5, t6
- sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
- sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
- addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
- addu s0, t4, t7
- subu s1, t4, t7
- addu s2, t5, t6
- subu s3, t5, t6
- addu t4, s0, t3
- subu s0, s0, t3
- addu t3, s2, t1
- subu s2, s2, t1
- addu t1, s3, t2
- subu s3, s3, t2
- addu t2, s1, t0
- subu s1, s1, t0
- shra_r.w t4, t4, 11
- shra_r.w t3, t3, 11
- shra_r.w t1, t1, 11
- shra_r.w t2, t2, 11
- shra_r.w s1, s1, 11
- shra_r.w s3, s3, 11
- shra_r.w s2, s2, 11
- shra_r.w s0, s0, 11
- sw t4, 0(v0)
- sw t3, 32(v0)
- sw t1, 64(v0)
- sw t2, 96(v0)
- sw s1, 128(v0)
- sw s3, 160(v0)
- sw s2, 192(v0)
- sw s0, 224(v0)
-3:
- addiu a1, a1, 2
- addiu a0, a0, 2
- bgtz v1, 1b
- addiu v0, v0, 4
- move v0, sp
- addiu v1, zero, 8
-4:
- lw t0, 8(v0) // z2 = (JLONG)wsptr[2]
- lw t1, 24(v0) // z3 = (JLONG)wsptr[6]
- lw t2, 0(v0) // (JLONG)wsptr[0]
- lw t3, 16(v0) // (JLONG)wsptr[4]
- lw s4, 4(v0) // (JLONG)wsptr[1]
- lw s5, 12(v0) // (JLONG)wsptr[3]
- lw s6, 20(v0) // (JLONG)wsptr[5]
- lw s7, 28(v0) // (JLONG)wsptr[7]
- or s4, s4, t0
- or s4, s4, t1
- or s4, s4, t3
- or s4, s4, s7
- or s4, s4, s5
- or s4, s4, s6
- bnez s4, 5f
- addiu v1, v1, -1
- shra_r.w s5, t2, 5
- andi s5, s5, 0x3ff
- lbux s5, s5(a3)
- lw s1, 0(a2)
- replv.qb s5, s5
- usw s5, 0(s1)
- usw s5, 4(s1)
- b 6f
- nop
-5:
- addu t4, t0, t1 // z2 + z3
- addiu t8, zero, 4433 // FIX_0_541196100
- mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
- addiu t8, zero, 15137 // FIX_1_847759065
- mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
- addiu t8, zero, 6270 // FIX_0_765366865
- mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
- addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4]
- subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4]
- sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
- sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
- subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
- subu t3, t2, t1 // tmp12 = tmp1 - tmp2
- addu t2, t2, t1 // tmp11 = tmp1 + tmp2
- addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
- subu t1, t4, t5 // tmp13 = tmp0 - tmp3
- addu t0, t4, t5 // tmp10 = tmp0 + tmp3
- lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7]
- lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3]
- lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5]
- lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1]
- addu s0, t4, t6 // z3 = tmp0 + tmp2
- addiu t8, zero, 9633 // FIX_1_175875602
- addu s1, t5, t7 // z4 = tmp1 + tmp3
- addu s2, s0, s1 // z3 + z4
- mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
- addu s3, t4, t7 // z1 = tmp0 + tmp3
- addu t9, t5, t6 // z2 = tmp1 + tmp2
- addiu t8, zero, 16069 // FIX_1_961570560
- mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
- addiu t8, zero, 3196 // FIX_0_390180644
- mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
- addiu t8, zero, 2446 // FIX_0_298631336
- mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
- addiu t8, zero, 7373 // FIX_0_899976223
- mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
- addiu t8, zero, 16819 // FIX_2_053119869
- mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
- addiu t8, zero, 20995 // FIX_2_562915447
- mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
- addiu t8, zero, 25172 // FIX_3_072711026
- mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
- addiu t8, zero, 12299 // FIX_1_501321110
- mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
- subu s0, s2, s0 // z3 += z5
- subu s1, s2, s1 // z4 += z5
- addu t4, t4, s0
- subu t4, t4, s3 // tmp0
- addu t5, t5, s1
- subu t5, t5, t9 // tmp1
- addu t6, t6, s0
- subu t6, t6, t9 // tmp2
- addu t7, t7, s1
- subu t7, t7, s3 // tmp3
- addu s0, t0, t7
- subu t0, t0, t7
- addu t7, t2, t6
- subu t2, t2, t6
- addu t6, t3, t5
- subu t3, t3, t5
- addu t5, t1, t4
- subu t1, t1, t4
- shra_r.w s0, s0, 18
- shra_r.w t7, t7, 18
- shra_r.w t6, t6, 18
- shra_r.w t5, t5, 18
- shra_r.w t1, t1, 18
- shra_r.w t3, t3, 18
- shra_r.w t2, t2, 18
- shra_r.w t0, t0, 18
- andi s0, s0, 0x3ff
- andi t7, t7, 0x3ff
- andi t6, t6, 0x3ff
- andi t5, t5, 0x3ff
- andi t1, t1, 0x3ff
- andi t3, t3, 0x3ff
- andi t2, t2, 0x3ff
- andi t0, t0, 0x3ff
- lw s1, 0(a2)
- lbux s0, s0(a3)
- lbux t7, t7(a3)
- lbux t6, t6(a3)
- lbux t5, t5(a3)
- lbux t1, t1(a3)
- lbux t3, t3(a3)
- lbux t2, t2(a3)
- lbux t0, t0(a3)
- sb s0, 0(s1)
- sb t7, 1(s1)
- sb t6, 2(s1)
- sb t5, 3(s1)
- sb t1, 4(s1)
- sb t3, 5(s1)
- sb t2, 6(s1)
- sb t0, 7(s1)
-6:
- addiu v0, v0, 32
- bgtz v1, 4b
- addiu a2, a2, 4
- addiu sp, sp, 256
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_islow_dspr2)
-
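-/*
- * A C sketch of the all-zero-AC shortcut taken at the top of pass 1 above
- * (standard libjpeg islow behavior): when a column has only a DC term, the
- * whole 1-D IDCT collapses to that dequantized value, pre-scaled by
- * PASS1_BITS. Names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-#define DCTSIZE 8
-static void idct_col_dc_only(const short *coef, const short *quant,
-                             int *wsptr)
-{
-  int dcval = (coef[0] * quant[0]) << 2;   // dequantize, << PASS1_BITS
-  for (int row = 0; row < DCTSIZE; row++)
-    wsptr[DCTSIZE * row] = dcval;          // entire column is the DC value
-}
-#endif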
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
-/*
- * a0 = inptr
- * a1 = quantptr
- * a2 = wsptr
- * a3 = mips_idct_ifast_coefs
- */
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu t9, a0, 16 // end address
- or AT, a3, zero
-
-0:
- lw s0, 0(a1) // quantptr[DCTSIZE*0]
- lw t0, 0(a0) // inptr[DCTSIZE*0]
- lw t1, 16(a0) // inptr[DCTSIZE*1]
- muleq_s.w.phl v0, t0, s0 // tmp0 ...
- lw t2, 32(a0) // inptr[DCTSIZE*2]
- lw t3, 48(a0) // inptr[DCTSIZE*3]
- lw t4, 64(a0) // inptr[DCTSIZE*4]
- lw t5, 80(a0) // inptr[DCTSIZE*5]
- muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
- lw t6, 96(a0) // inptr[DCTSIZE*6]
- lw t7, 112(a0) // inptr[DCTSIZE*7]
- or s4, t1, t2
- or s5, t3, t4
- bnez s4, 1f
- ins t0, v0, 16, 16 // ... tmp0
- bnez s5, 1f
- or s6, t5, t6
- or s6, s6, t7
- bnez s6, 1f
- sw t0, 0(a2) // wsptr[DCTSIZE*0]
- sw t0, 16(a2) // wsptr[DCTSIZE*1]
- sw t0, 32(a2) // wsptr[DCTSIZE*2]
- sw t0, 48(a2) // wsptr[DCTSIZE*3]
- sw t0, 64(a2) // wsptr[DCTSIZE*4]
- sw t0, 80(a2) // wsptr[DCTSIZE*5]
- sw t0, 96(a2) // wsptr[DCTSIZE*6]
- sw t0, 112(a2) // wsptr[DCTSIZE*7]
- addiu a0, a0, 4
- b 2f
- addiu a1, a1, 4
-
-1:
- lw s1, 32(a1) // quantptr[DCTSIZE*2]
- lw s2, 64(a1) // quantptr[DCTSIZE*4]
- muleq_s.w.phl v0, t2, s1 // tmp1 ...
- muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
- lw s0, 16(a1) // quantptr[DCTSIZE*1]
- lw s1, 48(a1) // quantptr[DCTSIZE*3]
- lw s3, 96(a1) // quantptr[DCTSIZE*6]
- muleq_s.w.phl v1, t4, s2 // tmp2 ...
- muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
- lw s2, 80(a1) // quantptr[DCTSIZE*5]
- lw t8, 4(AT) // FIX(1.414213562)
- ins t2, v0, 16, 16 // ... tmp1
- muleq_s.w.phl v0, t6, s3 // tmp3 ...
- muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
- ins t4, v1, 16, 16 // ... tmp2
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- ins t6, v0, 16, 16 // ... tmp3
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- muleq_s.w.phl v0, t1, s0 // tmp4 ...
- muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
- shll_s.ph s6, s6, 1 // x2
- lw s3, 112(a1) // quantptr[DCTSIZE*7]
- subq.ph s6, s6, s7 // ... tmp12
- muleq_s.w.phl v1, t7, s3 // tmp7 ...
- muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
- ins t1, v0, 16, 16 // ... tmp4
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- muleq_s.w.phl v0, t5, s2 // tmp6 ...
- muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
- ins t7, v1, 16, 16 // ... tmp7
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- muleq_s.w.phl v1, t3, s1 // tmp5 ...
- muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
- ins t5, v0, 16, 16 // ... tmp6
- ins t3, v1, 16, 16 // ... tmp5
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7
- subq.ph s1, t0, t7
- mulq_s.ph v1, v1, t8 // ... z5
- shll_s.ph s5, s5, 1 // x2
- lw t8, 12(AT) // FIX(-2.613125930)
- sw s0, 0(a2) // wsptr[DCTSIZE*0]
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 4
- addiu a1, a1, 4
- sw s1, 112(a2) // wsptr[DCTSIZE*7]
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- addq.ph s2, t2, t5
- addq.ph t1, s4, t3 // tmp4
- subq.ph s3, t2, t5
- sw s2, 16(a2) // wsptr[DCTSIZE*1]
- sw s3, 96(a2) // wsptr[DCTSIZE*6]
- addq.ph v0, t4, t3
- subq.ph v1, t4, t3
- sw v0, 32(a2) // wsptr[DCTSIZE*2]
- sw v1, 80(a2) // wsptr[DCTSIZE*5]
- addq.ph v0, t6, t1
- subq.ph v1, t6, t1
- sw v0, 64(a2) // wsptr[DCTSIZE*4]
- sw v1, 48(a2) // wsptr[DCTSIZE*3]
-
-2:
- bne a0, t9, 0b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_ifast_cols_dspr2)
-
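-/*
- * A hedged C sketch of the AAN even-part butterfly computed above (libjpeg
- * ifast semantics). The asm works on packed Q15 halfwords via mulq_s.ph;
- * the 8.8 multiply below is a simplification for illustration only.
- */
-#if 0 /* reference sketch, not assembled */
-static void ifast_even_part(int in0, int in2, int in4, int in6, int out[4])
-{
-  int tmp10 = in0 + in4, tmp11 = in0 - in4;
-  int tmp13 = in2 + in6;
-  int tmp12 = (((in2 - in6) * 362) >> 8) - tmp13;  // 362/256 ~= sqrt(2)
-  out[0] = tmp10 + tmp13;
-  out[3] = tmp10 - tmp13;
-  out[1] = tmp11 + tmp12;
-  out[2] = tmp11 - tmp12;
-}
-#endif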
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
-/*
- * a0 = wsptr
- * a1 = output_buf
- * a2 = output_col
- * a3 = mips_idct_ifast_coefs
- */
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- addiu t9, a0, 128 // end address
- lui s8, 0x8080
- ori s8, s8, 0x8080
-
-0:
- lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
- lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
- lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
- lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
- lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
- lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
- lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
- lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
- lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
- precrq.ph.w t1, s0, t0 // B b
- ins t0, s0, 16, 16 // A a
- bnez t1, 1f
- or s0, t2, s2
- bnez s0, 1f
- or s0, t4, s4
- bnez s0, 1f
- or s0, t6, s6
- bnez s0, 1f
- shll_s.ph s0, t0, 2 // A a
- lw a3, 0(a1)
- lw AT, 4(a1)
- precrq.ph.w t0, s0, s0 // A A
- ins s0, s0, 16, 16 // a a
- addu a3, a3, a2
- addu AT, AT, a2
- precrq.qb.ph t0, t0, t0 // A A A A
- precrq.qb.ph s0, s0, s0 // a a a a
- addu.qb s0, s0, s8
- addu.qb t0, t0, s8
- sw s0, 0(a3)
- sw s0, 4(a3)
- sw t0, 0(AT)
- sw t0, 4(AT)
- addiu a0, a0, 32
- bne a0, t9, 0b
- addiu a1, a1, 8
- b 2f
- nop
-
-1:
- precrq.ph.w t3, s2, t2
- ins t2, s2, 16, 16
- precrq.ph.w t5, s4, t4
- ins t4, s4, 16, 16
- precrq.ph.w t7, s6, t6
- ins t6, s6, 16, 16
- lw t8, 4(AT) // FIX(1.414213562)
- addq.ph s4, t0, t4 // tmp10
- subq.ph s5, t0, t4 // tmp11
- subq.ph s6, t2, t6 // tmp12 ...
- addq.ph s7, t2, t6 // tmp13
- mulq_s.ph s6, s6, t8 // ... tmp12 ...
- addq.ph t0, s4, s7 // tmp0
- subq.ph t6, s4, s7 // tmp3
- shll_s.ph s6, s6, 1 // x2
- subq.ph s6, s6, s7 // ... tmp12
- addq.ph t2, s5, s6 // tmp1
- subq.ph t4, s5, s6 // tmp2
- addq.ph s5, t1, t7 // z11
- subq.ph s6, t1, t7 // z12
- addq.ph s7, t5, t3 // z13
- subq.ph v0, t5, t3 // z10
- addq.ph t7, s5, s7 // tmp7
- subq.ph s5, s5, s7 // tmp11 ...
- addq.ph v1, v0, s6 // z5 ...
- mulq_s.ph s5, s5, t8 // ... tmp11
- lw t8, 8(AT) // FIX(1.847759065)
- lw s4, 0(AT) // FIX(1.082392200)
- addq.ph s0, t0, t7 // tmp0 + tmp7
- subq.ph s7, t0, t7 // tmp0 - tmp7
- mulq_s.ph v1, v1, t8 // ... z5
- lw a3, 0(a1)
- lw t8, 12(AT) // FIX(-2.613125930)
- shll_s.ph s5, s5, 1 // x2
- addu a3, a3, a2
- shll_s.ph v0, v0, 1 // x4
- mulq_s.ph v0, v0, t8 // tmp12 ...
- mulq_s.ph s4, s6, s4 // tmp10 ...
- shll_s.ph v1, v1, 1 // x2
- addiu a0, a0, 32
- addiu a1, a1, 8
- shll_s.ph s6, v0, 1 // x4
- shll_s.ph s4, s4, 1 // x2
- addq.ph s6, s6, v1 // ... tmp12
- shll_s.ph s0, s0, 2
- subq.ph t5, s6, t7 // tmp6
- subq.ph s4, s4, v1 // ... tmp10
- subq.ph t3, s5, t5 // tmp5
- shll_s.ph s7, s7, 2
- addq.ph t1, s4, t3 // tmp4
- addq.ph s1, t2, t5 // tmp1 + tmp6
- subq.ph s6, t2, t5 // tmp1 - tmp6
- addq.ph s2, t4, t3 // tmp2 + tmp5
- subq.ph s5, t4, t3 // tmp2 - tmp5
- addq.ph s4, t6, t1 // tmp3 + tmp4
- subq.ph s3, t6, t1 // tmp3 - tmp4
- shll_s.ph s1, s1, 2
- shll_s.ph s2, s2, 2
- shll_s.ph s3, s3, 2
- shll_s.ph s4, s4, 2
- shll_s.ph s5, s5, 2
- shll_s.ph s6, s6, 2
- precrq.ph.w t0, s1, s0 // B A
- ins s0, s1, 16, 16 // b a
- precrq.ph.w t2, s3, s2 // D C
- ins s2, s3, 16, 16 // d c
- precrq.ph.w t4, s5, s4 // F E
- ins s4, s5, 16, 16 // f e
- precrq.ph.w t6, s7, s6 // H G
- ins s6, s7, 16, 16 // h g
- precrq.qb.ph t0, t2, t0 // D C B A
- precrq.qb.ph s0, s2, s0 // d c b a
- precrq.qb.ph t4, t6, t4 // H G F E
- precrq.qb.ph s4, s6, s4 // h g f e
- addu.qb s0, s0, s8
- addu.qb s4, s4, s8
- sw s0, 0(a3) // outptr[0/1/2/3] d c b a
- sw s4, 4(a3) // outptr[4/5/6/7] h g f e
- lw a3, -4(a1)
- addu.qb t0, t0, s8
- addu a3, a3, a2
- addu.qb t4, t4, s8
- sw t0, 0(a3) // outptr[0/1/2/3] D C B A
- bne a0, t9, 0b
- sw t4, 4(a3) // outptr[4/5/6/7] H G F E
-
-2:
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
- j ra
- nop
-
-END(jsimd_idct_ifast_rows_dspr2)
-
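-/*
- * The addu.qb with s8 = 0x80808080 above re-centers four saturated signed
- * bytes into the unsigned sample range at once. Since adding 128 modulo 256
- * is the same as flipping the sign bit, a portable C equivalent is:
- */
-#if 0 /* reference sketch, not assembled */
-#include <stdint.h>
-static uint32_t center_samples(uint32_t signed_bytes)
-{
-  return signed_bytes ^ 0x80808080u;  // per-byte +128 == flip each sign bit
-}
-#endif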
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_fdct_islow_dspr2)
-/*
- * a0 = data
- */
- SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- lui t0, 6437
- ori t0, 2260
- lui t1, 9633
- ori t1, 11363
- lui t2, 0xd39e
- ori t2, 0xe6dc
- lui t3, 0xf72d
- ori t3, 9633
- lui t4, 2261
- ori t4, 9633
- lui t5, 0xd39e
- ori t5, 6437
- lui t6, 9633
- ori t6, 0xd39d
- lui t7, 0xe6dc
- ori t7, 2260
- lui t8, 4433
- ori t8, 10703
- lui t9, 0xd630
- ori t9, 4433
- li s8, 8
- move a1, a0
-1:
- lw s0, 0(a1) // tmp0 = 1|0
- lw s1, 4(a1) // tmp1 = 3|2
- lw s2, 8(a1) // tmp2 = 5|4
- lw s3, 12(a1) // tmp3 = 7|6
- packrl.ph s1, s1, s1 // tmp1 = 2|3
- packrl.ph s3, s3, s3 // tmp3 = 6|7
- subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
- dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
- dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
- dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
- dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
- addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
- extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
- addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sh s0, 2(a1)
- sh s1, 6(a1)
- sh s2, 10(a1)
- sh s3, 14(a1)
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
- sra s4, s5, 16 // tmp4 = t11
- addiu a1, a1, 16
- addiu s8, s8, -1
- extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
- extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
- addu s2, s5, s4 // tmp2 = t10 + t11
- subu s3, s5, s4 // tmp3 = t10 - t11
- sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
- sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
- sh s2, -16(a1)
- sh s3, -8(a1)
- sh s0, -12(a1)
- bgtz s8, 1b
- sh s1, -4(a1)
- li t0, 2260
- li t1, 11363
- li t2, 9633
- li t3, 6436
- li t4, 6437
- li t5, 2261
- li t6, 11362
- li t7, 2259
- li t8, 4433
- li t9, 10703
- li a1, 10704
- li s8, 8
-
-2:
- lh a2, 0(a0) // 0
- lh a3, 16(a0) // 8
- lh v0, 32(a0) // 16
- lh v1, 48(a0) // 24
- lh s4, 64(a0) // 32
- lh s5, 80(a0) // 40
- lh s6, 96(a0) // 48
- lh s7, 112(a0) // 56
- addu s2, v0, s5 // tmp2 = 16 + 40
- subu s5, v0, s5 // tmp5 = 16 - 40
- addu s3, v1, s4 // tmp3 = 24 + 32
- subu s4, v1, s4 // tmp4 = 24 - 32
- addu s0, a2, s7 // tmp0 = 0 + 56
- subu s7, a2, s7 // tmp7 = 0 - 56
- addu s1, a3, s6 // tmp1 = 8 + 48
- subu s6, a3, s6 // tmp6 = 8 - 48
- addu a2, s0, s3 // tmp10 = tmp0 + tmp3
- subu v1, s0, s3 // tmp13 = tmp0 - tmp3
- addu a3, s1, s2 // tmp11 = tmp1 + tmp2
- subu v0, s1, s2 // tmp12 = tmp1 - tmp2
- mult s7, t1 // ac0 = tmp7 * c1
- madd s4, t0 // ac0 += tmp4 * c0
- madd s5, t4 // ac0 += tmp5 * c4
- madd s6, t2 // ac0 += tmp6 * c2
- mult $ac1, s7, t2 // ac1 = tmp7 * c2
- msub $ac1, s4, t3 // ac1 -= tmp4 * c3
- msub $ac1, s5, t6 // ac1 -= tmp5 * c6
- msub $ac1, s6, t7 // ac1 -= tmp6 * c7
- mult $ac2, s7, t4 // ac2 = tmp7 * c4
- madd $ac2, s4, t2 // ac2 += tmp4 * c2
- madd $ac2, s5, t5 // ac2 += tmp5 * c5
- msub $ac2, s6, t6 // ac2 -= tmp6 * c6
- mult $ac3, s7, t0 // ac3 = tmp7 * c0
- msub $ac3, s4, t1 // ac3 -= tmp4 * c1
- madd $ac3, s5, t2 // ac3 += tmp5 * c2
- msub $ac3, s6, t3 // ac3 -= tmp6 * c3
- extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
- extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
- extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
- extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
- addiu s8, s8, -1
- addu s4, a2, a3 // tmp4 = tmp10 + tmp11
- subu s5, a2, a3 // tmp5 = tmp10 - tmp11
- sh s0, 16(a0)
- sh s1, 48(a0)
- sh s2, 80(a0)
- sh s3, 112(a0)
- mult v0, t8 // ac0 = tmp12 * c8
- madd v1, t9 // ac0 += tmp13 * c9
- mult $ac1, v1, t8 // ac1 = tmp13 * c8
- msub $ac1, v0, a1 // ac1 -= tmp12 * c10
- addiu a0, a0, 2
- extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
- extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
- shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
- shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
- sh s4, -2(a0)
- sh s5, 62(a0)
- sh s6, 30(a0)
- bgtz s8, 2b
- sh s7, 94(a0)
-
- RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
- jr ra
- nop
-
-END(jsimd_fdct_islow_dspr2)
-
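-/*
- * The extr_r.w rd, $acX, N instructions used throughout are libjpeg's
- * rounding DESCALE; a C sketch (assuming two's-complement arithmetic):
- */
-#if 0 /* reference sketch, not assembled */
-static long descale(long x, int n)
-{
-  return (x + (1L << (n - 1))) >> n;  // add half, then drop n fraction bits
-}
-#endif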
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
-/*
- * a0 = data
- */
- .set at
-
- SAVE_REGS_ON_STACK 8, s0, s1
-
- li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
- li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
- li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
- li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
-
- move v0, a0
- addiu v1, v0, 128 // end address
-
-0:
- lw t0, 0(v0) // tmp0 = 1|0
- lw t1, 4(v0) // tmp1 = 3|2
- lw t2, 8(v0) // tmp2 = 5|4
- lw t3, 12(v0) // tmp3 = 7|6
- packrl.ph t1, t1, t1 // tmp1 = 2|3
- packrl.ph t3, t3, t3 // tmp3 = 6|7
- subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
- subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
- addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
- addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
- addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
- subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
- sra t4, t8, 16 // tmp4 = t11
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
- mult $ac1, $0, $0 // ac1 = 0
- dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
- dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98
- mult $ac2, $0, $0 // ac2 = 0
- dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
- mult $ac3, $0, $0 // ac3 = 0
- dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
- precrq.ph.w t0, t5, t7 // t0 = t5|t6
- addq.ph t2, t8, t4 // tmp2 = t10 + t11
- subq.ph t3, t8, t4 // tmp3 = t10 - t11
- extr.w t4, $ac0, 8
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
- extr.w t0, $ac1, 8 // t0 = z5
- extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
- extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
- extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
- add t6, t1, t0 // t6 = z2
- add t7, t7, t0 // t7 = z4
- subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
- addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
- addq.ph t1, t0, t6 // t1 = z13 + z2
- subq.ph t6, t0, t6 // t6 = z13 - z2
- addq.ph t0, t8, t7 // t0 = z11 + z4
- subq.ph t7, t8, t7 // t7 = z11 - z4
- addq.ph t5, t4, t9
- subq.ph t4, t9, t4
- sh t2, 0(v0)
- sh t5, 4(v0)
- sh t3, 8(v0)
- sh t4, 12(v0)
- sh t1, 10(v0)
- sh t6, 6(v0)
- sh t0, 2(v0)
- sh t7, 14(v0)
- addiu v0, 16
- bne v1, v0, 0b
- nop
- move v0, a0
- addiu v1, v0, 16
-
-1:
- lh t0, 0(v0) // 0
- lh t1, 16(v0) // 8
- lh t2, 32(v0) // 16
- lh t3, 48(v0) // 24
- lh t4, 64(v0) // 32
- lh t5, 80(v0) // 40
- lh t6, 96(v0) // 48
- lh t7, 112(v0) // 56
- add t8, t0, t7 // t8 = tmp0
- sub t7, t0, t7 // t7 = tmp7
- add t0, t1, t6 // t0 = tmp1
- sub t1, t1, t6 // t1 = tmp6
- add t6, t2, t5 // t6 = tmp2
- sub t5, t2, t5 // t5 = tmp5
- add t2, t3, t4 // t2 = tmp3
- sub t3, t3, t4 // t3 = tmp4
- add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
- sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
- sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
- ins t8, s0, 16, 16 // t8 = tmp12|tmp13
- add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
- mult $0, $0 // ac0 = 0
- dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
- add s0, t4, t2 // s0 = tmp10 + tmp11
- sub t4, t4, t2 // t4 = tmp10-tmp11
- sh s0, 0(v0)
- sh t4, 64(v0)
- extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
- addq.ph t4, t8, t2 // t4 = tmp13 + z1
- subq.ph t8, t8, t2 // t8 = tmp13 - z1
- sh t4, 32(v0)
- sh t8, 96(v0)
- add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
- add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
- add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
- andi t4, a1, 0xffff
- mul s0, t1, t4
- sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
- ins t1, t3, 16, 16 // t1 = tmp10|tmp12
- mult $0, $0 // ac0 = 0
- mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
- extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
- add t2, t7, t8 // t2 = tmp7 + z5
- sub t7, t7, t8 // t7 = tmp7 - z5
- andi t4, a2, 0xffff
- mul t8, t3, t4
- sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
- andi t4, s1, 0xffff
- mul t6, t0, t4
- sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
- add t0, t6, t8 // t0 = z3 + z2
- sub t1, t6, t8 // t1 = z3 - z2
- add t3, t6, s0 // t3 = z3 + z4
- sub t4, t6, s0 // t4 = z3 - z4
- sub t5, t2, t1 // t5 = dataptr[5]
- sub t6, t7, t0 // t6 = dataptr[3]
- add t3, t2, t3 // t3 = dataptr[1]
- add t4, t7, t4 // t4 = dataptr[7]
- sh t5, 80(v0)
- sh t6, 48(v0)
- sh t3, 16(v0)
- sh t4, 112(v0)
- addiu v0, 2
- bne v0, v1, 1b
- nop
-
- RESTORE_REGS_FROM_STACK 8, s0, s1
-
- j ra
- nop
-END(jsimd_fdct_ifast_dspr2)
-
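-/*
- * The li constants above are the AAN rotation factors in 8.8 fixed point
- * (value = round(constant * 256)): 181 ~ 0.707106781, 98 ~ 0.382683433,
- * 139 ~ 0.541196100, 334 ~ 1.306562965. A sketch of one such multiply,
- * matching the extr.w ..., 8 descales:
- */
-#if 0 /* reference sketch, not assembled */
-static int mul_fix88(int x, int c)
-{
-  return (x * c) >> 8;  // c = round(real_constant * 256)
-}
-#endif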
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_quantize_dspr2)
-/*
- * a0 = coef_block
- * a1 = divisors
- * a2 = workspace
- */
- .set at
-
- SAVE_REGS_ON_STACK 16, s0, s1, s2
-
- addiu v0, a2, 124 // v0 = workspace_end
- lh t0, 0(a2)
- lh t1, 0(a1)
- lh t2, 128(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- lh t8, 386(a1)
-
-1:
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- addiu a2, a2, 4
- addiu a1, a1, 4
- add s1, t6, t5
- andi s1, 0xffff
- sh v1, 0(a0)
-
- mul s2, s1, t7
- addiu s1, t8, 16
- srav s2, s2, s1
- mul s2, s2, s0
- lh t0, 0(a2)
- lh t1, 0(a1)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- lh t2, 128(a1)
- lh t4, 384(a1)
- lh t5, 130(a1)
- lh t8, 386(a1)
- lh t6, 2(a2)
- lh t7, 2(a1)
- sh s2, 2(a0)
- lh t0, 0(a2)
- sra t3, t0, 15
- sll t3, t3, 1
- addiu t3, t3, 1
- mul t0, t0, t3
- bne a2, v0, 1b
- addiu a0, a0, 4
-
- andi t1, 0xffff
- add t9, t0, t2
- andi t9, 0xffff
- mul v1, t9, t1
- sra s0, t6, 15
- sll s0, s0, 1
- addiu s0, s0, 1
- addiu t9, t4, 16
- srav v1, v1, t9
- mul v1, v1, t3
- mul t6, t6, s0
- andi t7, 0xffff
- sh v1, 0(a0)
- add s1, t6, t5
- andi s1, 0xffff
- mul s2, s1, t7
- addiu s1, t8, 16
- addiu a2, a2, 4
- addiu a1, a1, 4
- srav s2, s2, s1
- mul s2, s2, s0
- sh s2, 2(a0)
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2
-
- j ra
- nop
-
-END(jsimd_quantize_dspr2)
-
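-/*
- * A hedged C sketch of one coefficient of the quantization loop above,
- * assuming the libjpeg-turbo SIMD divisor layout (64 reciprocals, then 64
- * corrections, scales, and shifts, as 16-bit words - hence the 0/128/384
- * byte offsets loaded above). Names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static short quantize_one(short coef, const unsigned short *div, int i)
-{
-  int sign = (coef >> 15) * 2 + 1;               // -1 or +1, as sra/sll/addiu
-  unsigned int mag = (unsigned int)(coef * sign) & 0xffffu;
-  unsigned int q = ((mag + div[i + 64]) & 0xffffu) * div[i];
-  q >>= div[i + 192] + 16;                       // shift table, plus 16
-  return (short)((int)q * sign);                 // restore the sign
-}
-#endif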
-
-#ifndef __mips_soft_float
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_quantize_float_dspr2)
-/*
- * a0 = coef_block
- * a1 = divisors
- * a2 = workspace
- */
- .set at
-
- li t1, 0x46800100 // 16384.5f, as an IEEE-754 single (bit pattern)
- mtc1 t1, f0
- li t0, 63
-0:
- lwc1 f2, 0(a2)
- lwc1 f10, 0(a1)
- lwc1 f4, 4(a2)
- lwc1 f12, 4(a1)
- lwc1 f6, 8(a2)
- lwc1 f14, 8(a1)
- lwc1 f8, 12(a2)
- lwc1 f16, 12(a1)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- lwc1 f10, 16(a1)
- lwc1 f12, 20(a1)
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- lwc1 f14, 24(a1)
- lwc1 f16, 28(a1)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- lwc1 f2, 16(a2)
- lwc1 f4, 20(a2)
- lwc1 f6, 24(a2)
- lwc1 f8, 28(a2)
- madd.s f2, f0, f2, f10
- madd.s f4, f0, f4, f12
- madd.s f6, f0, f6, f14
- madd.s f8, f0, f8, f16
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- trunc.w.s f2, f2
- trunc.w.s f4, f4
- trunc.w.s f6, f6
- trunc.w.s f8, f8
- sh t1, 0(a0)
- sh t2, 2(a0)
- sh t3, 4(a0)
- sh t4, 6(a0)
- mfc1 t1, f2
- mfc1 t2, f4
- mfc1 t3, f6
- mfc1 t4, f8
- addiu t0, t0, -8
- addiu a2, a2, 32
- addiu a1, a1, 32
- addiu t1, t1, -16384
- addiu t2, t2, -16384
- addiu t3, t3, -16384
- addiu t4, t4, -16384
- sh t1, 8(a0)
- sh t2, 10(a0)
- sh t3, 12(a0)
- sh t4, 14(a0)
- bgez t0, 0b
- addiu a0, a0, 16
-
- j ra
- nop
-
-END(jsimd_quantize_float_dspr2)
-
-#endif
-
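-/*
- * A C sketch of the float rounding trick above: adding 16384.5f (the
- * 0x46800100 constant) before trunc.w.s makes truncation round to nearest
- * and keeps the intermediate positive; subtracting 16384 undoes the offset.
- */
-#if 0 /* reference sketch, not assembled */
-static short quantize_float_one(float workspace_val, float divisor)
-{
-  float temp = workspace_val * divisor + 16384.5f;  // madd.s above
-  return (short)((int)temp - 16384);                // trunc.w.s + addiu
-}
-#endif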
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_2x2_dspr2)
-/*
- * a0 = compptr->dct_table
- * a1 = coef_block
- * a2 = output_buf
- * a3 = output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
- addiu sp, sp, -40
- move v0, sp
- addiu s2, zero, 29692
- addiu s3, zero, -10426
- addiu s4, zero, 6967
- addiu s5, zero, -5906
- lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
- lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
- lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
- lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
- mul t4, t5, t0
- lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
- lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
- mul t6, t6, t1
- mul t5, t5, t0
- lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
- lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
- lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
- lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
- mul t7, t7, t2
- mult zero, zero
- mul t8, t8, t3
- li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
- li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
- ins t6, t5, 16, 16 // t6 = t5|t6
- sll t4, t4, 15
- dpa.w.ph $ac0, t6, s0
- lh t1, 2(a1)
- lh t6, 2(a0)
- ins t8, t7, 16, 16 // t8 = t7|t8
- dpa.w.ph $ac0, t8, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 18(a1)
- lh t6, 18(a0)
- lh t2, 50(a1)
- lh t7, 50(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 82(a1)
- lh t2, 82(a0)
- lh t3, 114(a1)
- lh t4, 114(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 0(v0)
- sw t8, 20(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 6(a1)
- lh t6, 6(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 22(a1)
- lh t6, 22(a0)
- lh t2, 54(a1)
- lh t7, 54(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 86(a1)
- lh t2, 86(a0)
- lh t3, 118(a1)
- lh t4, 118(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 4(v0)
- sw t8, 24(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 10(a1)
- lh t6, 10(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 26(a1)
- lh t6, 26(a0)
- lh t2, 58(a1)
- lh t7, 58(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 90(a1)
- lh t2, 90(a0)
- lh t3, 122(a1)
- lh t4, 122(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 8(v0)
- sw t8, 28(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- lh t1, 14(a1)
- lh t6, 14(a0)
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- mul t5, t6, t1
- lh t1, 30(a1)
- lh t6, 30(a0)
- lh t2, 62(a1)
- lh t7, 62(a0)
- mul t6, t6, t1
- subu t8, t4, t0
- mul t7, t7, t2
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- lh t1, 94(a1)
- lh t2, 94(a0)
- lh t3, 126(a1)
- lh t4, 126(a0)
- shra_r.w t8, t8, 13
- mul t1, t1, t2
- mul t3, t3, t4
- sw t0, 12(v0)
- sw t8, 32(v0)
- sll t4, t5, 15
- ins t7, t6, 16, 16
- mult zero, zero
- dpa.w.ph $ac0, t7, s0
- ins t3, t1, 16, 16
- dpa.w.ph $ac0, t3, s1
- mflo t0, $ac0
- lw t9, 0(a2)
- lw t3, 0(v0)
- lw t7, 4(v0)
- lw t1, 8(v0)
- addu t9, t9, a3
- sll t3, t3, 15
- subu t8, t4, t0
- addu t0, t4, t0
- shra_r.w t0, t0, 13
- shra_r.w t8, t8, 13
- sw t0, 16(v0)
- sw t8, 36(v0)
- lw t5, 12(v0)
- lw t6, 16(v0)
- mult t7, s2
- madd t1, s3
- madd t5, s4
- madd t6, s5
- lw t5, 24(v0)
- lw t7, 28(v0)
- mflo t0, $ac0
- lw t8, 32(v0)
- lw t2, 36(v0)
- mult $ac1, t5, s2
- madd $ac1, t7, s3
- madd $ac1, t8, s4
- madd $ac1, t2, s5
- addu t1, t3, t0
- subu t6, t3, t0
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- mflo t4, $ac1
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- lw t0, 20(v0)
- sb t1, 0(t9)
- sb t6, 1(t9)
- sll t0, t0, 15
- lw t9, 4(a2)
- addu t1, t0, t4
- subu t6, t0, t4
- addu t9, t9, a3
- shra_r.w t1, t1, 20
- shra_r.w t6, t6, 20
- shll_s.w t1, t1, 24
- shll_s.w t6, t6, 24
- sra t1, t1, 24
- sra t6, t6, 24
- addiu t1, t1, 128
- addiu t6, t6, 128
- sb t1, 0(t9)
- sb t6, 1(t9)
- addiu sp, sp, 40
-
- RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
- j ra
- nop
-
-END(jsimd_idct_2x2_dspr2)
-
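-/*
- * A hedged C sketch of one column of pass 1 above, following libjpeg's
- * jidctred 2x2 semantics: only coefficient rows 0, 1, 3, 5, 7 contribute,
- * and the li constants are FIX(x) = round(x * 8192) for 3.624509785,
- * -1.272758580, 0.850430095 and -0.720959822. Names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static void idct2x2_col(long d0, long d1, long d3, long d5, long d7,
-                        long *out_top, long *out_bot)
-{
-  long tmp10 = d0 << 15;                     // << (CONST_BITS + 2)
-  long tmp0 = 29692L * d1 - 10426L * d3 + 6967L * d5 - 5906L * d7;
-  *out_top = (tmp10 + tmp0 + 4096) >> 13;    // shra_r.w ..., 13
-  *out_bot = (tmp10 - tmp0 + 4096) >> 13;
-}
-#endif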
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_4x4_dspr2)
-/*
- * a0 = compptr->dct_table
- * a1 = coef_block
- * a2 = output_buf
- * a3 = output_col
- * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- lw v1, 48(sp)
- move t0, a1
- move t1, v1
- li t9, 4
- li s0, 0x2e75f93e
- li s1, 0x21f9ba79
- li s2, 0xecc2efb0
- li s3, 0x52031ccd
-
-0:
- lh s6, 32(t0) // inptr[DCTSIZE*2]
- lh t6, 32(a0) // quantptr[DCTSIZE*2]
- lh s7, 96(t0) // inptr[DCTSIZE*6]
- lh t7, 96(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 0(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 0(a0) // quantptr[0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh t5, 112(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s4, 112(a0) // quantptr[DCTSIZE*7]
- lh v0, 80(t0) // inptr[DCTSIZE*5]
- lh s5, 80(a0) // quantptr[DCTSIZE*5]
- lh s6, 48(a0) // quantptr[DCTSIZE*3]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s7, 16(a0) // quantptr[DCTSIZE*1]
- lh t8, 16(t0) // inptr[DCTSIZE*1]
- subu t6, t6, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 48(t0) // inptr[DCTSIZE*3]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, t6 // tmp10 = tmp0 + z2
- subu t4, t2, t6 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, v0, 16, 16
- ins t7, t8, 16, 16
- addiu t9, t9, -1
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo s4, $ac0
- mflo s5, $ac1
- addiu a0, a0, 2
- addiu t1, t1, 4
- addiu t0, t0, 2
- addu t6, t4, s4
- subu t5, t4, s4
- addu s6, t3, s5
- subu s7, t3, s5
- shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw t6, 28(t1)
- sw t5, 60(t1)
- sw s6, -4(t1)
- bgtz t9, 0b
- sw s7, 92(t1)
- // second loop: process three more columns (input column 4 is skipped)
- li t9, 3
-1:
- lh s6, 34(t0) // inptr[DCTSIZE*2]
- lh t6, 34(a0) // quantptr[DCTSIZE*2]
- lh s7, 98(t0) // inptr[DCTSIZE*6]
- lh t7, 98(a0) // quantptr[DCTSIZE*6]
- mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh s4, 2(t0) // inptr[DCTSIZE*0]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s5, 2(a0) // quantptr[DCTSIZE*0]
- li s6, 15137
- li s7, 6270
- mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
- mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
- lh t5, 114(t0) // inptr[DCTSIZE*7]
- mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
- lh s4, 114(a0) // quantptr[DCTSIZE*7]
- lh s5, 82(a0) // quantptr[DCTSIZE*5]
- lh t6, 82(t0) // inptr[DCTSIZE*5]
- sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
- lh s6, 50(a0) // quantptr[DCTSIZE*3]
- lh t8, 18(t0) // inptr[DCTSIZE*1]
- subu v0, v0, t7 // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
- lh t7, 50(t0) // inptr[DCTSIZE*3]
- lh s7, 18(a0) // quantptr[DCTSIZE*1]
- mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
- mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
- mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
- mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
- addu t3, t2, v0 // tmp10 = tmp0 + z2
- subu t4, t2, v0 // tmp12 = tmp0 - z2
- mult $ac0, zero, zero
- mult $ac1, zero, zero
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- mflo t5, $ac0
- mflo t6, $ac1
- addiu t9, t9, -1
- addiu t0, t0, 2
- addiu a0, a0, 2
- addiu t1, t1, 4
- addu s5, t4, t5
- subu s4, t4, t5
- addu s6, t3, t6
- subu s7, t3, t6
- shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
- shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
- shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
- shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
- sw s5, 32(t1)
- sw s4, 64(t1)
- sw s6, 0(t1)
- bgtz t9, 1b
- sw s7, 96(t1)
- move t1, v1
- li s4, 15137
- lw s6, 8(t1) // wsptr[2]
- li s5, 6270
- lw s7, 24(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
- lw t2, 0(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
- lh t5, 28(t1) // wsptr[7]
- lh t6, 20(t1) // wsptr[5]
- lh t7, 12(t1) // wsptr[3]
- lh t8, 4(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 0(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 2
- li s4, 15137
- lw s6, 40(t1) // wsptr[2]
- li s5, 6270
- lw s7, 56(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
- lw t2, 32(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
- lh t5, 60(t1) // wsptr[7]
- lh t6, 52(t1) // wsptr[5]
- lh t7, 44(t1) // wsptr[3]
- lh t8, 36(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
- sll s4, t9, 2
- lw v0, 4(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- // 3
- li s4, 15137
- lw s6, 72(t1) // wsptr[2]
- li s5, 6270
- lw s7, 88(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
- lw t2, 64(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
- lh t5, 92(t1) // wsptr[7]
- lh t6, 84(t1) // wsptr[5]
- lh t7, 76(t1) // wsptr[3]
- lh t8, 68(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 8(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
- li s4, 15137
- lw s6, 104(t1) // wsptr[2]
- li s5, 6270
- lw s7, 120(t1) // wsptr[6]
- mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
- lw t2, 96(t1) // wsptr[0]
- mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
- lh t5, 124(t1) // wsptr[7]
- lh t6, 116(t1) // wsptr[5]
- lh t7, 108(t1) // wsptr[3]
- lh t8, 100(t1) // wsptr[1]
- ins t5, t6, 16, 16
- ins t7, t8, 16, 16
- mult $ac0, zero, zero
- dpa.w.ph $ac0, t5, s0
- dpa.w.ph $ac0, t7, s1
- mult $ac1, zero, zero
- dpa.w.ph $ac1, t5, s2
- dpa.w.ph $ac1, t7, s3
- sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
- mflo s6, $ac0
- // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
- subu s4, s4, s5
- addu t3, t2, s4 // tmp10 = tmp0 + z2
- mflo s7, $ac1
- subu t4, t2, s4 // tmp12 = tmp0 - z2
- addu t7, t4, s6
- subu t8, t4, s6
- addu t5, t3, s7
- subu t6, t3, s7
- shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
- shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
- shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
- shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
- sll s4, t9, 2
- lw v0, 12(a2) // output_buf[ctr]
- shll_s.w t5, t5, 24
- shll_s.w t6, t6, 24
- shll_s.w t7, t7, 24
- shll_s.w t8, t8, 24
- sra t5, t5, 24
- sra t6, t6, 24
- sra t7, t7, 24
- sra t8, t8, 24
- addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
- addiu t5, t5, 128
- addiu t6, t6, 128
- addiu t7, t7, 128
- addiu t8, t8, 128
- sb t5, 0(v0)
- sb t7, 1(v0)
- sb t8, 2(v0)
- sb t6, 3(v0)
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-END(jsimd_idct_4x4_dspr2)
-
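-/*
- * A C sketch of the even part computed above (libjpeg jidctred 4x4
- * semantics): 15137 and 6270 are FIX_1_847759065 and FIX_0_765366865.
- * Names are illustrative.
- */
-#if 0 /* reference sketch, not assembled */
-static void idct4x4_even(long d0, long d2, long d6,
-                         long *tmp10, long *tmp12)
-{
-  long tmp0 = d0 << 14;                  // << (CONST_BITS + 1)
-  long tmp2 = 15137L * d2 - 6270L * d6;
-  *tmp10 = tmp0 + tmp2;
-  *tmp12 = tmp0 - tmp2;
-}
-#endif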
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_6x6_dspr2)
-/*
- * a0 = compptr->dct_table
- * a1 = coef_block
- * a2 = output_buf
- * a3 = output_col
- */
- .set at
-
- SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- addiu sp, sp, -144
- move v0, sp
- addiu v1, v0, 24
- addiu t9, zero, 5793
- addiu s0, zero, 10033
- addiu s1, zero, 2998
-
-1:
- lh s2, 0(a0) // q0 = quantptr[ 0]
- lh s3, 32(a0) // q1 = quantptr[16]
- lh s4, 64(a0) // q2 = quantptr[32]
- lh t2, 64(a1) // tmp2 = inptr[32]
- lh t1, 32(a1) // tmp1 = inptr[16]
- lh t0, 0(a1) // tmp0 = inptr[ 0]
- mul t2, t2, s4 // tmp2 = tmp2 * q2
- mul t1, t1, s3 // tmp1 = tmp1 * q1
- mul t0, t0, s2 // tmp0 = tmp0 * q0
- lh t6, 16(a1) // z1 = inptr[ 8]
- lh t8, 80(a1) // z3 = inptr[40]
- lh t7, 48(a1) // z2 = inptr[24]
- lh s2, 16(a0) // q0 = quantptr[ 8]
- lh s4, 80(a0) // q2 = quantptr[40]
- lh s3, 48(a0) // q1 = quantptr[24]
- mul t2, t2, t9 // tmp2 = tmp2 * 5793
- mul t1, t1, s0 // tmp1 = tmp1 * 10033
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- mul t6, t6, s2 // z1 = z1 * q0
- mul t8, t8, s4 // z3 = z3 * q2
- mul t7, t7, s3 // z2 = z2 * q1
- addu t3, t0, t2 // tmp10 = tmp0 + tmp2
- sll t2, t2, 1 // tmp2 = tmp2 << 1
- subu t4, t0, t2 // tmp11 = tmp0 - tmp2 - tmp2
- subu t5, t3, t1 // tmp12 = tmp10 - tmp1
- addu t3, t3, t1 // tmp10 = tmp10 + tmp1
- addu t1, t6, t8 // tmp1 = z1 + z3
- mul t1, t1, s1 // tmp1 = tmp1 * 2998
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- subu t2, t6, t8 // tmp2 = z1 - z3
- subu t2, t2, t7 // tmp2 = tmp2 - z2
- sll t2, t2, 2 // tmp2 = tmp2 << 2
- addu t0, t6, t7 // tmp0 = z1 + z2
- sll t0, t0, 13 // tmp0 = tmp0 << 13
- subu s2, t8, t7 // q0 = z3 - z2
- sll s2, s2, 13 // q0 = q0 << 13
- addu t0, t0, t1 // tmp0 = tmp0 + tmp1
- addu t1, s2, t1 // tmp1 = q0 + tmp1
- addu s2, t4, t2 // q0 = tmp11 + tmp2
- subu s3, t4, t2 // q1 = tmp11 - tmp2
- addu t6, t3, t0 // z1 = tmp10 + tmp0
- subu t7, t3, t0 // z2 = tmp10 - tmp0
- addu t4, t5, t1 // tmp11 = tmp12 + tmp1
- subu t5, t5, t1 // tmp12 = tmp12 - tmp1
- shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
- shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
- shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
- shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
- sw s2, 24(v0)
- sw s3, 96(v0)
- sw t6, 0(v0)
- sw t7, 120(v0)
- sw t4, 48(v0)
- sw t5, 72(v0)
- addiu v0, v0, 4
- addiu a1, a1, 2
- bne v0, v1, 1b
- addiu a0, a0, 2
-
- /* Pass 2: process 6 rows from work array, store into output array. */
- move v0, sp
- addiu v1, v0, 144
-
-2:
- lw t0, 0(v0)
- lw t2, 16(v0)
- lw s5, 0(a2)
- addiu t0, t0, 16
- sll t0, t0, 13
- mul t3, t2, t9
- lw t6, 4(v0)
- lw t8, 20(v0)
- lw t7, 12(v0)
- addu s5, s5, a3
- addu s6, t6, t8
- mul s6, s6, s1
- addu t1, t0, t3
- subu t4, t0, t3
- subu t4, t4, t3
- lw t3, 8(v0)
- mul t0, t3, s0
- addu s7, t6, t7
- sll s7, s7, 13
- addu s7, s6, s7
- subu t2, t8, t7
- sll t2, t2, 13
- addu t2, s6, t2
- subu s6, t6, t7
- subu s6, s6, t8
- sll s6, s6, 13
- addu t3, t1, t0
- subu t5, t1, t0
- addu t6, t3, s7
- subu t3, t3, s7
- addu t7, t4, s6
- subu t4, t4, s6
- addu t8, t5, t2
- subu t5, t5, t2
- shll_s.w t6, t6, 6
- shll_s.w t3, t3, 6
- shll_s.w t7, t7, 6
- shll_s.w t4, t4, 6
- shll_s.w t8, t8, 6
- shll_s.w t5, t5, 6
- sra t6, t6, 24
- addiu t6, t6, 128
- sra t3, t3, 24
- addiu t3, t3, 128
- sb t6, 0(s5)
- sra t7, t7, 24
- addiu t7, t7, 128
- sb t3, 5(s5)
- sra t4, t4, 24
- addiu t4, t4, 128
- sb t7, 1(s5)
- sra t8, t8, 24
- addiu t8, t8, 128
- sb t4, 4(s5)
- addiu v0, v0, 24
- sra t5, t5, 24
- addiu t5, t5, 128
- sb t8, 2(s5)
- addiu a2, a2, 4
- bne v0, v1, 2b
- sb t5, 3(s5)
-
- addiu sp, sp, 144
-
- RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
- j ra
- nop
-
-END(jsimd_idct_6x6_dspr2)
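The scaling constants above (5793, 10033, 2998) and the FIX(...) comments in the 12x12 routines below are 13-bit fixed-point encodings of irrational DCT factors. A sketch of the convention, assuming libjpeg's usual CONST_BITS = 13:

#include <stdint.h>

#define CONST_BITS 13

/* Fixed-point constant: round(x * 2^13). This reproduces the li values
   in the listing, e.g. FIX(0.707106781) = 5793, FIX(1.224744871) = 10033,
   FIX(0.541196100) = 4433 and FIX(1.847759065) = 15137. */
#define FIX(x) ((int32_t)((x) * (1 << CONST_BITS) + 0.5))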
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
-/*
- * a0 = compptr->dct_table
- * a1 = coef_block
- * a2 = workspace
- */
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 8
-
-1:
- // odd part
- lh t0, 48(a1)
- lh t1, 48(a0)
- lh t2, 16(a1)
- lh t3, 16(a0)
- lh t4, 80(a1)
- lh t5, 80(a0)
- lh t6, 112(a1)
- lh t7, 112(a0)
- mul t0, t0, t1 // z2
- mul t1, t2, t3 // z1
- mul t2, t4, t5 // z3
- mul t3, t6, t7 // z4
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- li t6, 7053 // FIX(0.860918669)
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t7, t1, t2 // tmp10
- addu t8, t7, t3 // tmp10 + z4
- mul t6, t6, t8 // tmp15
- li t8, 2139 // FIX(0.261052384)
- mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
- li t7, 2295 // FIX(0.280143716)
- mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1 -= z4
- subu t0, t0, t2 // z2 -= z3
- addu t2, t0, t1 // z1 + z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t8, t6, t8 // tmp12
- addu t3, t8, t4 // tmp12 + tmp11
- addu t3, t3, t7 // tmp10
- subu t8, t8, t9 // tmp12 + tmp13
- addu s0, t5, s0
- subu t8, t8, s0 // tmp12
- subu t9, t6, t9
- subu s1, s1, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- // even part start
- lh t4, 64(a1)
- lh t5, 64(a0)
- lh t7, 32(a1)
- lh s0, 32(a0)
- lh s1, 0(a1)
- lh s2, 0(a0)
- lh s3, 96(a1)
- lh v0, 96(a0)
- mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
- mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
- mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
- mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
- // odd part end
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // update counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 2
- addiu a1, a1, 2
- // even part rest
- li s1, 10033
- li s2, 11190
- mul t4, t4, s1 // z4
- mul s1, t5, s2 // z4
- sll t5, t5, 13 // z1
- sll t7, t7, 13
- addiu t7, t7, 1024 // z3
- sll s0, s0, 13 // z2
- addu s2, t7, t4 // tmp10
- subu t4, t7, t4 // tmp11
- subu s3, t5, s0 // tmp12
- addu t2, t7, s3 // tmp21
- subu s3, t7, s3 // tmp24
- addu t7, s1, s0 // tmp12
- addu v0, s2, t7 // tmp20
- subu s2, s2, t7 // tmp25
- subu s1, s1, t5 // z4 - z1
- subu s1, s1, s0 // tmp12
- addu s0, t4, s1 // tmp22
- subu t4, t4, s1 // tmp23
- // final output stage
- addu t5, v0, t3
- subu v0, v0, t3
- addu t3, t2, t1
- subu t2, t2, t1
- addu t1, s0, t8
- subu s0, s0, t8
- addu t8, t4, t9
- subu t4, t4, t9
- addu t9, s3, t0
- subu s3, s3, t0
- addu t0, s2, t6
- subu s2, s2, t6
- sra t5, t5, 11
- sra t3, t3, 11
- sra t1, t1, 11
- sra t8, t8, 11
- sra t9, t9, 11
- sra t0, t0, 11
- sra s2, s2, 11
- sra s3, s3, 11
- sra t4, t4, 11
- sra s0, s0, 11
- sra t2, t2, 11
- sra v0, v0, 11
- sw t5, 0(a2)
- sw t3, 32(a2)
- sw t1, 64(a2)
- sw t8, 96(a2)
- sw t9, 128(a2)
- sw t0, 160(a2)
- sw s2, 192(a2)
- sw s3, 224(a2)
- sw t4, 256(a2)
- sw s0, 288(a2)
- sw t2, 320(a2)
- sw v0, 352(a2)
- bgtz a3, 1b
- addiu a2, a2, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- j ra
- nop
-
-END(jsimd_idct_12x12_pass1_dspr2)
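Each lh/lh/mul triple in the even part implements libjpeg's DEQUANTIZE, a raw coefficient multiplied by the matching quantization-table entry, exactly as the inline comments spell out. In C terms (a sketch; DCTSIZE is 8 as in libjpeg):

#define DCTSIZE 8
#define DEQUANTIZE(coef, quantval) ((coef) * (quantval))

/* The four even-part inputs loaded above, written out:
   DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4])
   DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2])
   DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0])
   DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]) */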
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
-/*
- * a0 = workspace
- * a1 = output
- */
- SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
- li a3, 12
-
-1:
- // Odd part
- lw t0, 12(a0)
- lw t1, 4(a0)
- lw t2, 20(a0)
- lw t3, 28(a0)
- li t4, 10703 // FIX(1.306562965)
- li t5, 4433 // FIX_0_541196100
- mul t4, t0, t4 // tmp11
- mul t5, t0, t5 // -tmp14
- addu t6, t1, t2 // tmp10
- li t7, 2139 // FIX(0.261052384)
- mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
- addu t6, t6, t3 // tmp10 + z4
- li t8, 7053 // FIX(0.860918669)
- mul t6, t6, t8 // tmp15
- li t8, 2295 // FIX(0.280143716)
- mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
- addu t9, t2, t3 // z3 + z4
- li s0, 8565 // FIX(1.045510580)
- mul t9, t9, s0 // -tmp13
- li s0, 12112 // FIX(1.478575242)
- mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
- li s1, 12998 // FIX(1.586706681)
- mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
- li s2, 5540 // FIX(0.676326758)
- mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
- li s3, 16244 // FIX(1.982889723)
- mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
- subu t1, t1, t3 // z1 -= z4
- subu t0, t0, t2 // z2 -= z3
- addu t2, t1, t0 // z1 + z2
- li t3, 4433 // FIX_0_541196100
- mul t2, t2, t3 // z3
- li t3, 6270 // FIX_0_765366865
- mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
- li t3, 15137 // FIX_1_847759065
- mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
- addu t3, t6, t7 // tmp12
- addu t7, t3, t4
- addu t7, t7, t8 // tmp10
- subu t3, t3, t9
- subu t3, t3, t5
- subu t3, t3, s0 // tmp12
- subu t9, t6, t9
- subu t9, t9, t4
- addu t9, t9, s1 // tmp13
- subu t6, t6, t5
- subu t6, t6, s2
- subu t6, t6, s3 // tmp15
- addu t1, t2, t1 // tmp11
- subu t0, t2, t0 // tmp14
- // even part
- lw t2, 16(a0) // z4
- lw t4, 8(a0) // z1
- lw t5, 0(a0) // z3
- lw t8, 24(a0) // z2
- li s0, 10033 // FIX(1.224744871)
- li s1, 11190 // FIX(1.366025404)
- mul t2, t2, s0 // z4
- mul s0, t4, s1 // z4
- addiu t5, t5, 0x10
- sll t5, t5, 13 // z3
- sll t4, t4, 13 // z1
- sll t8, t8, 13 // z2
- subu s1, t4, t8 // tmp12
- addu s2, t5, t2 // tmp10
- subu t2, t5, t2 // tmp11
- addu s3, t5, s1 // tmp21
- subu s1, t5, s1 // tmp24
- addu t5, s0, t8 // tmp12
- addu v0, s2, t5 // tmp20
- subu t5, s2, t5 // tmp25
- subu t4, s0, t4
- subu t4, t4, t8 // tmp12
- addu t8, t2, t4 // tmp22
- subu t2, t2, t4 // tmp23
- // increment counter and pointers
- addiu a3, a3, -1
- addiu a0, a0, 32
- // Final stage
- addu t4, v0, t7
- subu v0, v0, t7
- addu t7, s3, t1
- subu s3, s3, t1
- addu t1, t8, t3
- subu t8, t8, t3
- addu t3, t2, t9
- subu t2, t2, t9
- addu t9, s1, t0
- subu s1, s1, t0
- addu t0, t5, t6
- subu t5, t5, t6
- sll t4, t4, 4
- sll t7, t7, 4
- sll t1, t1, 4
- sll t3, t3, 4
- sll t9, t9, 4
- sll t0, t0, 4
- sll t5, t5, 4
- sll s1, s1, 4
- sll t2, t2, 4
- sll t8, t8, 4
- sll s3, s3, 4
- sll v0, v0, 4
- shll_s.w t4, t4, 2
- shll_s.w t7, t7, 2
- shll_s.w t1, t1, 2
- shll_s.w t3, t3, 2
- shll_s.w t9, t9, 2
- shll_s.w t0, t0, 2
- shll_s.w t5, t5, 2
- shll_s.w s1, s1, 2
- shll_s.w t2, t2, 2
- shll_s.w t8, t8, 2
- shll_s.w s3, s3, 2
- shll_s.w v0, v0, 2
- srl t4, t4, 24
- srl t7, t7, 24
- srl t1, t1, 24
- srl t3, t3, 24
- srl t9, t9, 24
- srl t0, t0, 24
- srl t5, t5, 24
- srl s1, s1, 24
- srl t2, t2, 24
- srl t8, t8, 24
- srl s3, s3, 24
- srl v0, v0, 24
- lw t6, 0(a1)
- addiu t4, t4, 0x80
- addiu t7, t7, 0x80
- addiu t1, t1, 0x80
- addiu t3, t3, 0x80
- addiu t9, t9, 0x80
- addiu t0, t0, 0x80
- addiu t5, t5, 0x80
- addiu s1, s1, 0x80
- addiu t2, t2, 0x80
- addiu t8, t8, 0x80
- addiu s3, s3, 0x80
- addiu v0, v0, 0x80
- sb t4, 0(t6)
- sb t7, 1(t6)
- sb t1, 2(t6)
- sb t3, 3(t6)
- sb t9, 4(t6)
- sb t0, 5(t6)
- sb t5, 6(t6)
- sb s1, 7(t6)
- sb t2, 8(t6)
- sb t8, 9(t6)
- sb s3, 10(t6)
- sb v0, 11(t6)
- bgtz a3, 1b
- addiu a1, a1, 4
-
- RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
- jr ra
- nop
-
-END(jsimd_idct_12x12_pass2_dspr2)
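Both passes close with a bgtz/addiu pair; on MIPS the instruction after a branch occupies the delay slot and executes on every iteration, including the last, so it doubles as a pointer update. The shape of pass 2 in C, with hypothetical helper names:

#include <stdint.h>

/* Hypothetical stand-in for the per-row arithmetic above. */
static void idct_12x12_row(const int32_t *wsrow, uint8_t *outrow)
{
  (void)wsrow; (void)outrow;  /* odd/even parts and final stage elided */
}

static void pass2_shape(const int32_t *workspace, uint8_t **output)
{
  for (int ctr = 12; ctr > 0; ctr--) {  /* li a3, 12 / addiu a3, a3, -1 */
    idct_12x12_row(workspace, *output);
    workspace += 8;  /* addiu a0, a0, 32: eight 32-bit workspace entries */
    output++;        /* addiu a1, a1, 4: runs in the bgtz delay slot */
  }
}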
-
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_convsamp_dspr2)
-/*
- * a0 = sample_data
- * a1 = start_col
- * a2 = workspace
- */
- lw t0, 0(a0)
- li t7, 0xff80ff80
- addu t0, t0, a1
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- lw t0, 4(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 0(a2)
- usw t4, 4(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 8(a2)
- usw t6, 12(a2)
-
- lw t0, 8(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 16(a2)
- usw t4, 20(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 24(a2)
- usw t6, 28(a2)
-
- lw t0, 12(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 32(a2)
- usw t4, 36(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 40(a2)
- usw t6, 44(a2)
-
- lw t0, 16(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 48(a2)
- usw t4, 52(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 56(a2)
- usw t6, 60(a2)
-
- lw t0, 20(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 64(a2)
- usw t4, 68(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 72(a2)
- usw t6, 76(a2)
-
- lw t0, 24(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 80(a2)
- usw t4, 84(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 88(a2)
- usw t6, 92(a2)
-
- lw t0, 28(a0)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu t0, t0, a1
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- ulw t1, 0(t0)
- ulw t2, 4(t0)
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 96(a2)
- usw t4, 100(a2)
- preceu.ph.qbr t3, t1
- preceu.ph.qbl t4, t1
- usw t5, 104(a2)
- usw t6, 108(a2)
- preceu.ph.qbr t5, t2
- preceu.ph.qbl t6, t2
- addu.ph t3, t3, t7
- addu.ph t4, t4, t7
- addu.ph t5, t5, t7
- addu.ph t6, t6, t7
- usw t3, 112(a2)
- usw t4, 116(a2)
- usw t5, 120(a2)
- usw t6, 124(a2)
-
- j ra
- nop
-
-END(jsimd_convsamp_dspr2)
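jsimd_convsamp_dspr2 widens each 8-byte sample row into packed halfwords (preceu.ph.qbr/qbl) and removes the 128 sample bias by adding 0xFF80FF80, two packed -128 halfwords, with addu.ph. A scalar C equivalent, assuming the 16-bit workspace elements that the packed-halfword stores imply:

#include <stdint.h>

/* Scalar shape of jsimd_convsamp_dspr2: center samples around zero.
   Adding 0xFF80 to a 16-bit lane is the same as subtracting 128. */
static void convsamp(uint8_t *sample_data[8], int start_col,
                     int16_t *workspace)
{
  for (int row = 0; row < 8; row++) {
    const uint8_t *src = sample_data[row] + start_col;
    for (int col = 0; col < 8; col++)
      *workspace++ = (int16_t)(src[col] - 128);
  }
}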
-
-
-#ifndef __mips_soft_float
-
-/*****************************************************************************/
-LEAF_DSPR2(jsimd_convsamp_float_dspr2)
-/*
- * a0 = sample_data
- * a1 = start_col
- * a2 = workspace
- */
- .set at
-
- lw t0, 0(a0)
- addu t0, t0, a1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 4(a0)
- swc1 f2, 0(a2)
- swc1 f4, 4(a2)
- swc1 f6, 8(a2)
- addu t0, t0, a1
- swc1 f8, 12(a2)
- swc1 f10, 16(a2)
- swc1 f12, 20(a2)
- swc1 f14, 24(a2)
- swc1 f16, 28(a2)
- // row 1
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 8(a0)
- swc1 f2, 32(a2)
- swc1 f4, 36(a2)
- swc1 f6, 40(a2)
- addu t0, t0, a1
- swc1 f8, 44(a2)
- swc1 f10, 48(a2)
- swc1 f12, 52(a2)
- swc1 f14, 56(a2)
- swc1 f16, 60(a2)
- // row 2
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 12(a0)
- swc1 f2, 64(a2)
- swc1 f4, 68(a2)
- swc1 f6, 72(a2)
- addu t0, t0, a1
- swc1 f8, 76(a2)
- swc1 f10, 80(a2)
- swc1 f12, 84(a2)
- swc1 f14, 88(a2)
- swc1 f16, 92(a2)
- // row 3
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 16(a0)
- swc1 f2, 96(a2)
- swc1 f4, 100(a2)
- swc1 f6, 104(a2)
- addu t0, t0, a1
- swc1 f8, 108(a2)
- swc1 f10, 112(a2)
- swc1 f12, 116(a2)
- swc1 f14, 120(a2)
- swc1 f16, 124(a2)
- // row 4
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 20(a0)
- swc1 f2, 128(a2)
- swc1 f4, 132(a2)
- swc1 f6, 136(a2)
- addu t0, t0, a1
- swc1 f8, 140(a2)
- swc1 f10, 144(a2)
- swc1 f12, 148(a2)
- swc1 f14, 152(a2)
- swc1 f16, 156(a2)
- // row 5
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 24(a0)
- swc1 f2, 160(a2)
- swc1 f4, 164(a2)
- swc1 f6, 168(a2)
- addu t0, t0, a1
- swc1 f8, 172(a2)
- swc1 f10, 176(a2)
- swc1 f12, 180(a2)
- swc1 f14, 184(a2)
- swc1 f16, 188(a2)
- // row 6
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- lw t0, 28(a0)
- swc1 f2, 192(a2)
- swc1 f4, 196(a2)
- swc1 f6, 200(a2)
- addu t0, t0, a1
- swc1 f8, 204(a2)
- swc1 f10, 208(a2)
- swc1 f12, 212(a2)
- swc1 f14, 216(a2)
- swc1 f16, 220(a2)
- // row 7
- lbu t1, 0(t0)
- lbu t2, 1(t0)
- lbu t3, 2(t0)
- lbu t4, 3(t0)
- lbu t5, 4(t0)
- lbu t6, 5(t0)
- lbu t7, 6(t0)
- lbu t8, 7(t0)
- addiu t1, t1, -128
- addiu t2, t2, -128
- addiu t3, t3, -128
- addiu t4, t4, -128
- addiu t5, t5, -128
- addiu t6, t6, -128
- addiu t7, t7, -128
- addiu t8, t8, -128
- mtc1 t1, f2
- mtc1 t2, f4
- mtc1 t3, f6
- mtc1 t4, f8
- mtc1 t5, f10
- mtc1 t6, f12
- mtc1 t7, f14
- mtc1 t8, f16
- cvt.s.w f2, f2
- cvt.s.w f4, f4
- cvt.s.w f6, f6
- cvt.s.w f8, f8
- cvt.s.w f10, f10
- cvt.s.w f12, f12
- cvt.s.w f14, f14
- cvt.s.w f16, f16
- swc1 f2, 224(a2)
- swc1 f4, 228(a2)
- swc1 f6, 232(a2)
- swc1 f8, 236(a2)
- swc1 f10, 240(a2)
- swc1 f12, 244(a2)
- swc1 f14, 248(a2)
- swc1 f16, 252(a2)
-
- j ra
- nop
-
-END(jsimd_convsamp_float_dspr2)
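The float variant performs the same centering, then converts each value with mtc1/cvt.s.w before storing it with swc1. In scalar C:

#include <stdint.h>

/* Scalar shape of jsimd_convsamp_float_dspr2 (a sketch). */
static void convsamp_float(uint8_t *sample_data[8], int start_col,
                           float *workspace)
{
  for (int row = 0; row < 8; row++) {
    const uint8_t *src = sample_data[row] + start_col;
    for (int col = 0; col < 8; col++)
      *workspace++ = (float)(src[col] - 128);
  }
}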
-
-#endif
-
-/*****************************************************************************/
diff --git a/simd/mips/jsimd_dspr2_asm.h b/simd/mips/jsimd_dspr2_asm.h
deleted file mode 100644
index 12cfda4..0000000
--- a/simd/mips/jsimd_dspr2_asm.h
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013, MIPS Technologies, Inc., California.
- * Copyright (C) 2018, Matthieu Darbois.
- * All Rights Reserved.
- * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
- * Darko Laus (darko.laus@imgtec.com)
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define zero $0
-#define AT $1
-#define v0 $2
-#define v1 $3
-#define a0 $4
-#define a1 $5
-#define a2 $6
-#define a3 $7
-#define t0 $8
-#define t1 $9
-#define t2 $10
-#define t3 $11
-#define t4 $12
-#define t5 $13
-#define t6 $14
-#define t7 $15
-#define s0 $16
-#define s1 $17
-#define s2 $18
-#define s3 $19
-#define s4 $20
-#define s5 $21
-#define s6 $22
-#define s7 $23
-#define t8 $24
-#define t9 $25
-#define k0 $26
-#define k1 $27
-#define gp $28
-#define sp $29
-#define fp $30
-#define s8 $30
-#define ra $31
-
-#define f0 $f0
-#define f1 $f1
-#define f2 $f2
-#define f3 $f3
-#define f4 $f4
-#define f5 $f5
-#define f6 $f6
-#define f7 $f7
-#define f8 $f8
-#define f9 $f9
-#define f10 $f10
-#define f11 $f11
-#define f12 $f12
-#define f13 $f13
-#define f14 $f14
-#define f15 $f15
-#define f16 $f16
-#define f17 $f17
-#define f18 $f18
-#define f19 $f19
-#define f20 $f20
-#define f21 $f21
-#define f22 $f22
-#define f23 $f23
-#define f24 $f24
-#define f25 $f25
-#define f26 $f26
-#define f27 $f27
-#define f28 $f28
-#define f29 $f29
-#define f30 $f30
-#define f31 $f31
-
-#ifdef __ELF__
-#define HIDDEN_SYMBOL(symbol) .hidden symbol;
-#else
-#define HIDDEN_SYMBOL(symbol)
-#endif
-
-/*
- * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
- */
-#define LEAF_MIPS32R2(symbol) \
- .globl symbol; \
- HIDDEN_SYMBOL(symbol) \
- .align 2; \
- .type symbol, @function; \
- .ent symbol, 0; \
-symbol: \
- .frame sp, 0, ra; \
- .set push; \
- .set arch = mips32r2; \
- .set noreorder; \
- .set noat;
-
-/*
- * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
- */
-#define LEAF_DSPR2(symbol) \
-LEAF_MIPS32R2(symbol) \
- .set dspr2;
-
-/*
- * END - mark end of function
- */
-#define END(function) \
- .set pop; \
- .end function; \
- .size function, .-function
-
-/*
- * Checks whether the stack offset is big enough for storing/restoring
- * regs_num registers to/from the stack. The offset must cover the number
- * of bytes needed to store the registers (regs_num * 4). Since the MIPS
- * ABI lets a function use the first 16 bytes of its stack frame (the area
- * reserved for the input arguments, which already live in a0-a3), the
- * stack size can be trimmed by counting that space toward the total.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
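Because the first four saved registers can live in the caller-provided 16-byte argument area, the macro only requires stack_offset to cover the remainder. The bound in C, with a worked example:

/* CHECK_STACK_OFFSET's bound: the 16-byte argument save area absorbs
   the first four registers, so only (regs_num * 4 - 16) extra bytes
   must come from stack_offset. */
static int stack_offset_ok(int regs_num, int stack_offset)
{
  return stack_offset >= regs_num * 4 - 16;
}

/* e.g. the 8-register saves above, SAVE_REGS_ON_STACK 32, s0-s7:
   8 * 4 - 16 = 16 <= 32, so the check passes. */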
-
-/*
- * Saves a set of registers on the stack. At most 14 registers can be
- * saved (a0-a3, v0-v1 and s0-s7). The stack offset is the number of
- * bytes subtracted from the stack pointer (sp) before the registers are
- * pushed, to make room on the stack (the offset must be a multiple of 4
- * and big enough, as described by the CHECK_STACK_OFFSET macro). This
- * macro is intended to be used together with RESTORE_REGS_FROM_STACK.
- * Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
-.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
- .error "Stack offset must be positive and a multiple of 4."
-.endif
-.if \stack_offset != 0
- addiu sp, sp, -\stack_offset
-.endif
- sw \r1, 0(sp)
-.if \r2 != 0
- sw \r2, 4(sp)
-.endif
-.if \r3 != 0
- sw \r3, 8(sp)
-.endif
-.if \r4 != 0
- sw \r4, 12(sp)
-.endif
-.if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- sw \r5, 16(sp)
-.endif
-.if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- sw \r6, 20(sp)
-.endif
-.if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- sw \r7, 24(sp)
-.endif
-.if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- sw \r8, 28(sp)
-.endif
-.if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- sw \r9, 32(sp)
-.endif
-.if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- sw \r10, 36(sp)
-.endif
-.if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- sw \r11, 40(sp)
-.endif
-.if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- sw \r12, 44(sp)
-.endif
-.if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- sw \r13, 48(sp)
-.endif
-.if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- sw \r14, 52(sp)
-.endif
-.endm
-
-/*
- * Restores a set of registers from the stack. At most 14 registers can
- * be restored (a0-a3, v0-v1 and s0-s7). The stack offset is the number
- * of bytes added to the stack pointer (sp) after the registers are
- * restored (the offset must be a multiple of 4 and big enough, as
- * described by the CHECK_STACK_OFFSET macro). This macro is intended to
- * be used together with SAVE_REGS_ON_STACK.
- * Example:
- * SAVE_REGS_ON_STACK 4, v0, v1, s0, s1
- * RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
- r2 = 0, r3 = 0, r4 = 0, \
- r5 = 0, r6 = 0, r7 = 0, \
- r8 = 0, r9 = 0, r10 = 0, \
- r11 = 0, r12 = 0, r13 = 0, \
- r14 = 0
-.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
- .error "Stack offset must be positive and a multiple of 4."
-.endif
- lw \r1, 0(sp)
-.if \r2 != 0
- lw \r2, 4(sp)
-.endif
-.if \r3 != 0
- lw \r3, 8(sp)
-.endif
-.if \r4 != 0
- lw \r4, 12(sp)
-.endif
-.if \r5 != 0
- CHECK_STACK_OFFSET 5, \stack_offset
- lw \r5, 16(sp)
-.endif
-.if \r6 != 0
- CHECK_STACK_OFFSET 6, \stack_offset
- lw \r6, 20(sp)
-.endif
-.if \r7 != 0
- CHECK_STACK_OFFSET 7, \stack_offset
- lw \r7, 24(sp)
-.endif
-.if \r8 != 0
- CHECK_STACK_OFFSET 8, \stack_offset
- lw \r8, 28(sp)
-.endif
-.if \r9 != 0
- CHECK_STACK_OFFSET 9, \stack_offset
- lw \r9, 32(sp)
-.endif
-.if \r10 != 0
- CHECK_STACK_OFFSET 10, \stack_offset
- lw \r10, 36(sp)
-.endif
-.if \r11 != 0
- CHECK_STACK_OFFSET 11, \stack_offset
- lw \r11, 40(sp)
-.endif
-.if \r12 != 0
- CHECK_STACK_OFFSET 12, \stack_offset
- lw \r12, 44(sp)
-.endif
-.if \r13 != 0
- CHECK_STACK_OFFSET 13, \stack_offset
- lw \r13, 48(sp)
-.endif
-.if \r14 != 0
- CHECK_STACK_OFFSET 14, \stack_offset
- lw \r14, 52(sp)
-.endif
-.if \stack_offset != 0
- addiu sp, sp, \stack_offset
-.endif
-.endm
diff --git a/simd/nasm/jcolsamp.inc b/simd/nasm/jcolsamp.inc
index a2d5b49..6f6d7f2 100644
--- a/simd/nasm/jcolsamp.inc
+++ b/simd/nasm/jcolsamp.inc
@@ -7,8 +7,6 @@
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
; --------------------------------------------------------------------------
diff --git a/simd/nasm/jdct.inc b/simd/nasm/jdct.inc
index 79d5146..9192f66 100644
--- a/simd/nasm/jdct.inc
+++ b/simd/nasm/jdct.inc
@@ -7,8 +7,6 @@
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
; Each IDCT routine is responsible for range-limiting its results and
; converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
diff --git a/simd/nasm/jpeg_nbits_table.inc b/simd/nasm/jpeg_nbits_table.inc
deleted file mode 100644
index 2ce6c28..0000000
--- a/simd/nasm/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
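The deleted table maps each 16-bit value v to the number of bits needed to represent it: 0 for 0, otherwise floor(log2(v)) + 1. A small C generator that reproduces the 65536 entries in roughly the table's 16-per-line layout:

#include <stdio.h>

/* Prints the bit length of every 16-bit value, 16 per line. */
int main(void)
{
  for (unsigned v = 0; v < 65536; v++) {
    unsigned nbits = 0;
    for (unsigned t = v; t != 0; t >>= 1)
      nbits++;
    printf("%2u%s", nbits, (v % 16 == 15) ? ", \\\n" : ", ");
  }
  return 0;
}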
-jpeg_nbits_table db \
- 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
  [elided: the row above repeats unchanged for 821 rows of this deleted
   hunk, i.e. 13,136 consecutive table entries equal to 15]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
  [elided: the row above repeats unchanged for 122 rows of this deleted
   hunk, i.e. 1,952 consecutive table entries equal to 16]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index b40901f..9930d80 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -2,7 +2,7 @@
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, D. R. Commander.
+; Copyright (C) 2010, 2016, 2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
@@ -24,8 +24,6 @@
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
; ==========================================================================
; System-dependent configurations
@@ -167,19 +165,19 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%define XMM_DWORD
%define XMM_MMWORD
-%define SIZEOF_BYTE 1 ; sizeof(BYTE)
-%define SIZEOF_WORD 2 ; sizeof(WORD)
-%define SIZEOF_DWORD 4 ; sizeof(DWORD)
-%define SIZEOF_QWORD 8 ; sizeof(QWORD)
-%define SIZEOF_OWORD 16 ; sizeof(OWORD)
-%define SIZEOF_YWORD 32 ; sizeof(YWORD)
+%define SIZEOF_BYTE 1 ; sizeof(byte)
+%define SIZEOF_WORD 2 ; sizeof(word)
+%define SIZEOF_DWORD 4 ; sizeof(dword)
+%define SIZEOF_QWORD 8 ; sizeof(qword)
+%define SIZEOF_OWORD 16 ; sizeof(oword)
+%define SIZEOF_YWORD 32 ; sizeof(yword)
%define BYTE_BIT 8 ; CHAR_BIT in C
-%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT
-%define YWORD_BIT 256 ; sizeof(YWORD)*BYTE_BIT
+%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
; --------------------------------------------------------------------------
; External Symbol Name
@@ -198,6 +196,11 @@ section .note.GNU-stack noalloc noexec nowrite progbits
%ifdef __YASM_VER__
%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
+%define GLOBAL_DATA(name) global EXTN(name):private_extern
+%endif
%endif
%endif
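
The added %else branch brings NASM to parity with YASM here: for NASM 2.14 or newer (__NASM_VERSION_ID__ >= 0x020E0000), Mach-O globals are also declared :private_extern so the SIMD entry points stay out of the exported symbol table. A rough C analog of this kind of version-gated visibility follows; the macro name, threshold, and entry point are invented for illustration, not taken from the patch.

/* Hedged sketch: hide SIMD entry points from the dynamic symbol table
 * when the compiler is known to support the attribute; otherwise fall
 * back to default visibility. */
#if defined(__GNUC__) && (__GNUC__ >= 4)
#define SIMD_HIDDEN __attribute__((visibility("hidden")))
#else
#define SIMD_HIDDEN
#endif

SIMD_HIDDEN void jsimd_example_entry(void);   /* hypothetical entry point */
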
diff --git a/simd/powerpc/jccolext-altivec.c b/simd/powerpc/jccolext-altivec.c
deleted file mode 100644
index 170f90f..0000000
--- a/simd/powerpc/jccolext-altivec.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2014, Jay Foad. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jccolor-altivec.c */
-
-
-void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- JSAMPROW inptr, outptr0, outptr1, outptr2;
- int pitch = img_width * RGB_PIXELSIZE, num_cols;
-#if __BIG_ENDIAN__
- int offset;
-#endif
- unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
-
- __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
- rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
-#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = { 0 };
-#endif
-#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = { 0 };
-#endif
- __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
- __vector unsigned short yl, yh, crl, crh, cbl, cbh;
- __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
-
- /* Constants */
- __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
- pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) },
- pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) },
- pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) };
- __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) };
- __vector int pd_onehalf = { __4X(ONE_HALF) },
- pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
- __vector unsigned char pb_zero = { __16X(0) },
-#if __BIG_ENDIAN__
- shift_pack_index =
- { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-#else
- shift_pack_index =
- { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-#endif
-
- while (--num_rows >= 0) {
- inptr = *input_buf++;
- outptr0 = output_buf[0][output_row];
- outptr1 = output_buf[1][output_row];
- outptr2 = output_buf[2][output_row];
- output_row++;
-
- for (num_cols = pitch; num_cols > 0;
- num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
- outptr0 += 16, outptr1 += 16, outptr2 += 16) {
-
-#if __BIG_ENDIAN__
- /* Load 16 pixels == 48 or 64 bytes */
- offset = (size_t)inptr & 15;
- if (offset) {
- __vector unsigned char unaligned_shift_index;
- int bytes = num_cols + offset;
-
- if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
- /* Slow path to prevent buffer overread. Since there is no way to
- * read a partial AltiVec register, overread would occur on the last
- * chunk of the last image row if the right edge is not on a 16-byte
- * boundary. It could also occur on other rows if the number of bytes
- * per row is low enough. Since we can't determine whether we're on the
- * last image row, we have to assume every row is the last.
- */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
-#endif
- } else {
- /* Fast path */
- rgb0 = vec_ld(0, inptr);
- if (bytes > 16)
- rgb1 = vec_ld(16, inptr);
- if (bytes > 32)
- rgb2 = vec_ld(32, inptr);
- if (bytes > 48)
- rgb3 = vec_ld(48, inptr);
-#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- rgb4 = vec_ld(64, inptr);
-#endif
- unaligned_shift_index = vec_lvsl(0, inptr);
- rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
- rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
- }
- } else {
-#endif /* __BIG_ENDIAN__ */
- if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
- /* Slow path */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = VEC_LD(0, tmpbuf);
- rgb1 = VEC_LD(16, tmpbuf);
- rgb2 = VEC_LD(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = VEC_LD(48, tmpbuf);
-#endif
- } else {
- /* Fast path */
- rgb0 = VEC_LD(0, inptr);
- if (num_cols > 16)
- rgb1 = VEC_LD(16, inptr);
- if (num_cols > 32)
- rgb2 = VEC_LD(32, inptr);
-#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- rgb3 = VEC_LD(48, inptr);
-#endif
- }
-#if __BIG_ENDIAN__
- }
-#endif
-
-#if RGB_PIXELSIZE == 3
- /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
- * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
- * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
- *
- * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
- * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
- * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
- * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
- */
- rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
- rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
- rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
- rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
-#else
- /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
- * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
- * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
- * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
- *
- * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
- * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
- * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
- * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
- */
- rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
- rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
- rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
- rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
-#endif
-
- /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
- * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
- * ...
- *
- * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
- * support unsigned vectors.
- */
- rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
- bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
- rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
- bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
- rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
- bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
- rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
- bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
-
- /* (Original)
- * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- *
- * (This implementation)
- * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
- * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
- */
-
- /* Calculate Y values */
-
- y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
- y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
- y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
- y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
- y0 = vec_msums(bg0, pw_f0114_f0250, y0);
- y1 = vec_msums(bg1, pw_f0114_f0250, y1);
- y2 = vec_msums(bg2, pw_f0114_f0250, y2);
- y3 = vec_msums(bg3, pw_f0114_f0250, y3);
- /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
- * each dword into a new 16-bit vector, which is the equivalent of
- * descaling the 32-bit results (right-shifting by 16 bits) and then
- * packing them.
- */
- yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
- shift_pack_index);
- yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
- shift_pack_index);
- y = vec_pack(yl, yh);
- vec_st(y, 0, outptr0);
-
- /* Calculate Cb values */
- cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj);
- cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj);
- cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj);
- cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj);
- cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000,
- (__vector unsigned int)cb0);
- cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000,
- (__vector unsigned int)cb1);
- cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000,
- (__vector unsigned int)cb2);
- cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000,
- (__vector unsigned int)cb3);
- cbl = vec_perm((__vector unsigned short)cb0,
- (__vector unsigned short)cb1, shift_pack_index);
- cbh = vec_perm((__vector unsigned short)cb2,
- (__vector unsigned short)cb3, shift_pack_index);
- cb = vec_pack(cbl, cbh);
- vec_st(cb, 0, outptr1);
-
- /* Calculate Cr values */
- cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj);
- cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj);
- cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj);
- cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj);
- cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000,
- (__vector unsigned int)cr0);
- cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000,
- (__vector unsigned int)cr1);
- cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000,
- (__vector unsigned int)cr2);
- cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000,
- (__vector unsigned int)cr3);
- crl = vec_perm((__vector unsigned short)cr0,
- (__vector unsigned short)cr1, shift_pack_index);
- crh = vec_perm((__vector unsigned short)cr2,
- (__vector unsigned short)cr3, shift_pack_index);
- cr = vec_pack(crl, crh);
- vec_st(cr, 0, outptr2);
- }
- }
-}
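
The slow-path comment above captures the key constraint of this file: AltiVec has no partial-register loads, so a chunk that extends past the end of the row must first be staged through a local scratch buffer. A scalar sketch of that guard, under the same 16-byte-vector assumption (the helper name is invented):

#include <string.h>

#define VEC 16   /* AltiVec register width in bytes */

/* Return a base pointer from which full 16-byte loads covering `chunk`
 * bytes are safe.  `bytes_left` is what remains of the image row; tmpbuf
 * must hold at least `chunk` bytes.  Lanes beyond bytes_left contain
 * garbage that the caller discards, as in the deleted code; on the fast
 * path the caller still skips vector loads past bytes_left, mirroring
 * the "if (num_cols > 16)" chain above. */
static const unsigned char *
safe_load_base(const unsigned char *inptr, int bytes_left, int chunk,
               unsigned char *tmpbuf)
{
  if (bytes_left < chunk && (bytes_left % VEC) != 0) {
    memcpy(tmpbuf, inptr, (size_t)bytes_left);  /* copy only owned bytes */
    return tmpbuf;                              /* loads now stay in bounds */
  }
  return inptr;                                 /* direct vector loads OK */
}
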
diff --git a/simd/powerpc/jccolor-altivec.c b/simd/powerpc/jccolor-altivec.c
deleted file mode 100644
index d670dbc..0000000
--- a/simd/powerpc/jccolor-altivec.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> YCC CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_081 5329 /* FIX(0.08131) */
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_168 11059 /* FIX(0.16874) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_331 21709 /* FIX(0.33126) */
-#define F_0_418 27439 /* FIX(0.41869) */
-#define F_0_500 32768 /* FIX(0.50000) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 \
- { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
-#define RGBG_INDEX1 \
- { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
-#define RGBG_INDEX2 \
- { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
-#define RGBG_INDEX3 \
- { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX \
- { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 \
- { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
-#define RGBG_INDEX1 \
- { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
-#define RGBG_INDEX2 \
- { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
-#define RGBG_INDEX3 \
- { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX \
- { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX \
- { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX \
- { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
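
For reference, the F_* values above are Q16 fixed-point weights, FIX(x) = round(x * 2^16), and F_0_337 encodes the regrouping Y = 0.299*R + 0.337*G + 0.114*B + 0.250*G used throughout the SIMD code. A minimal scalar check of those constants, as a sketch rather than anything from the patch:

#include <stdio.h>

#define SCALEBITS 16
#define FIX(x)    ((int)((x) * (1 << SCALEBITS) + 0.5))
#define ONE_HALF  (1 << (SCALEBITS - 1))

int main(void)
{
  int r = 255, g = 128, b = 0;
  /* Same regrouping as the SIMD code: 0.587*G = 0.337*G + 0.250*G. */
  int y = (FIX(0.29900) * r + FIX(0.33700) * g +
           FIX(0.11400) * b + FIX(0.25000) * g + ONE_HALF) >> SCALEBITS;
  printf("Y = %d\n", y);   /* 151; float math gives 151.38 */
  return 0;
}
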
diff --git a/simd/powerpc/jcgray-altivec.c b/simd/powerpc/jcgray-altivec.c
deleted file mode 100644
index a11a7e7..0000000
--- a/simd/powerpc/jcgray-altivec.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> GRAYSCALE CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_114 7471 /* FIX(0.11400) */
-#define F_0_250 16384 /* FIX(0.25000) */
-#define F_0_299 19595 /* FIX(0.29900) */
-#define F_0_587 38470 /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 \
- { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 }
-#define RGBG_INDEX1 \
- { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
-#define RGBG_INDEX2 \
- { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 }
-#define RGBG_INDEX3 \
- { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 }
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX \
- { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 }
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 \
- { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 }
-#define RGBG_INDEX1 \
- { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
-#define RGBG_INDEX2 \
- { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 }
-#define RGBG_INDEX3 \
- { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 }
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX \
- { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 }
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX \
- { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 }
-#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX \
- { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 }
-#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
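
This file is a template expander: each #include of jcgryext-altivec.c re-instantiates the same conversion loop with a different RGBG_INDEX permutation and function name. Below is a toy reduction of the pattern, using a macro instead of a re-included file; all names are invented, and the Q16 weights are the ones listed in the deleted file.

#include <stdio.h>

/* One "template", expanded once per pixel layout -- the same idea as
 * re-including jcgryext-altivec.c with different RGBG_INDEX macros. */
#define DEFINE_GRAY(fn, r_off, g_off, b_off)                        \
  static int fn(const unsigned char *px)                            \
  {                                                                 \
    /* integer luma: FIX(0.299), FIX(0.587), FIX(0.114) */          \
    return (19595 * px[r_off] + 38470 * px[g_off] +                 \
            7471 * px[b_off] + 32768) >> 16;                        \
  }

DEFINE_GRAY(gray_rgb, 0, 1, 2)    /* R G B   */
DEFINE_GRAY(gray_bgr, 2, 1, 0)    /* B G R   */
DEFINE_GRAY(gray_xrgb, 1, 2, 3)   /* X R G B */

int main(void)
{
  unsigned char rgb[3] = { 255, 128, 0 };
  unsigned char bgr[3] = { 0, 128, 255 };
  unsigned char xrgb[4] = { 0, 255, 128, 0 };
  printf("%d %d %d\n", gray_rgb(rgb), gray_bgr(bgr), gray_xrgb(xrgb));
  /* prints "151 151 151": same pixel, three memory layouts */
  return 0;
}
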
diff --git a/simd/powerpc/jcgryext-altivec.c b/simd/powerpc/jcgryext-altivec.c
deleted file mode 100644
index b280cbb..0000000
--- a/simd/powerpc/jcgryext-altivec.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2014, Jay Foad. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jcgray-altivec.c */
-
-
-void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row, int num_rows)
-{
- JSAMPROW inptr, outptr;
- int pitch = img_width * RGB_PIXELSIZE, num_cols;
-#if __BIG_ENDIAN__
- int offset;
- unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
-#endif
-
- __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
- rgbg0, rgbg1, rgbg2, rgbg3, y;
-#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
- __vector unsigned char rgb3 = { 0 };
-#endif
-#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
- __vector unsigned char rgb4 = { 0 };
-#endif
- __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
- __vector unsigned short yl, yh;
- __vector int y0, y1, y2, y3;
-
- /* Constants */
- __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
- pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
- __vector int pd_onehalf = { __4X(ONE_HALF) };
- __vector unsigned char pb_zero = { __16X(0) },
-#if __BIG_ENDIAN__
- shift_pack_index =
- { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-#else
- shift_pack_index =
- { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-#endif
-
- while (--num_rows >= 0) {
- inptr = *input_buf++;
- outptr = output_buf[0][output_row];
- output_row++;
-
- for (num_cols = pitch; num_cols > 0;
- num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
- outptr += 16) {
-
-#if __BIG_ENDIAN__
- /* Load 16 pixels == 48 or 64 bytes */
- offset = (size_t)inptr & 15;
- if (offset) {
- __vector unsigned char unaligned_shift_index;
- int bytes = num_cols + offset;
-
- if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
- /* Slow path to prevent buffer overread. Since there is no way to
- * read a partial AltiVec register, overread would occur on the last
- * chunk of the last image row if the right edge is not on a 16-byte
- * boundary. It could also occur on other rows if the number of bytes
- * per row is low enough. Since we can't determine whether we're on the
- * last image row, we have to assume every row is the last.
- */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
-#endif
- } else {
- /* Fast path */
- rgb0 = vec_ld(0, inptr);
- if (bytes > 16)
- rgb1 = vec_ld(16, inptr);
- if (bytes > 32)
- rgb2 = vec_ld(32, inptr);
- if (bytes > 48)
- rgb3 = vec_ld(48, inptr);
-#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- rgb4 = vec_ld(64, inptr);
-#endif
- unaligned_shift_index = vec_lvsl(0, inptr);
- rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
- rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
-#endif
- }
- } else {
- if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
- /* Slow path */
- memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16));
- rgb0 = vec_ld(0, tmpbuf);
- rgb1 = vec_ld(16, tmpbuf);
- rgb2 = vec_ld(32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- rgb3 = vec_ld(48, tmpbuf);
-#endif
- } else {
- /* Fast path */
- rgb0 = vec_ld(0, inptr);
- if (num_cols > 16)
- rgb1 = vec_ld(16, inptr);
- if (num_cols > 32)
- rgb2 = vec_ld(32, inptr);
-#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- rgb3 = vec_ld(48, inptr);
-#endif
- }
- }
-#else
- /* Little endian */
- rgb0 = vec_vsx_ld(0, inptr);
- if (num_cols > 16)
- rgb1 = vec_vsx_ld(16, inptr);
- if (num_cols > 32)
- rgb2 = vec_vsx_ld(32, inptr);
-#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- rgb3 = vec_vsx_ld(48, inptr);
-#endif
-#endif
-
-#if RGB_PIXELSIZE == 3
- /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
- * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
- * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
- *
- * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
- * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
- * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
- * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
- */
- rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
- rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
- rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
- rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
-#else
- /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
- * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
- * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
- * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
- *
- * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
- * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
- * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
- * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
- */
- rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
- rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
- rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
- rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
-#endif
-
- /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
- * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
- * ...
- *
- * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
- * support unsigned vectors.
- */
- rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0);
- bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0);
- rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1);
- bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1);
- rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2);
- bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2);
- rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3);
- bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3);
-
- /* (Original)
- * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
- *
- * (This implementation)
- * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
- */
-
- /* Calculate Y values */
-
- y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
- y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
- y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
- y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
- y0 = vec_msums(bg0, pw_f0114_f0250, y0);
- y1 = vec_msums(bg1, pw_f0114_f0250, y1);
- y2 = vec_msums(bg2, pw_f0114_f0250, y2);
- y3 = vec_msums(bg3, pw_f0114_f0250, y3);
- /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
- * each dword into a new 16-bit vector, which is the equivalent of
- * descaling the 32-bit results (right-shifting by 16 bits) and then
- * packing them.
- */
- yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
- shift_pack_index);
- yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
- shift_pack_index);
- y = vec_pack(yl, yh);
- vec_st(y, 0, outptr);
- }
- }
-}
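
The "clever way to avoid 4 shifts + 2 packs" is worth restating on scalars: taking the high 16-bit half of each 32-bit Q16 result is identical to shifting right by 16 and narrowing, and the byte position of that high half is why shift_pack_index differs between big- and little-endian builds. A self-contained check (the sample value is the Y computed in the sketch earlier):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  uint32_t y32 = 9953653;                 /* a 32-bit Q16 luma result */
  uint16_t a = (uint16_t)(y32 >> 16);     /* descale, then narrow */

  /* Grab the high half by byte position instead -- what vec_perm with
   * shift_pack_index does lane-wise.  The index vector differs per
   * endianness because the high half sits at different byte offsets. */
  uint16_t halves[2];
  memcpy(halves, &y32, sizeof(y32));
#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__
  uint16_t b = halves[0];
#else
  uint16_t b = halves[1];
#endif
  printf("%u %u\n", (unsigned)a, (unsigned)b);  /* same value twice: 151 */
  return 0;
}
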
diff --git a/simd/powerpc/jcsample-altivec.c b/simd/powerpc/jcsample-altivec.c
deleted file mode 100644
index 6e25b8d..0000000
--- a/simd/powerpc/jcsample-altivec.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA DOWNSAMPLING */
-
-#include "jsimd_altivec.h"
-#include "jcsample.h"
-
-
-void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
- int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_in_blocks,
- JSAMPARRAY input_data,
- JSAMPARRAY output_data)
-{
- int outrow, outcol;
- JDIMENSION output_cols = width_in_blocks * DCTSIZE;
- JSAMPROW inptr, outptr;
-
- __vector unsigned char this0, next0, out;
- __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
-
- /* Constants */
- __vector unsigned short pw_bias = { __4X2(0, 1) },
- pw_one = { __8X(1) };
- __vector unsigned char even_odd_index =
- { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
- pb_zero = { __16X(0) };
-
- expand_right_edge(input_data, max_v_samp_factor, image_width,
- output_cols * 2);
-
- for (outrow = 0; outrow < v_samp_factor; outrow++) {
- outptr = output_data[outrow];
- inptr = input_data[outrow];
-
- for (outcol = output_cols; outcol > 0;
- outcol -= 16, inptr += 32, outptr += 16) {
-
- this0 = vec_ld(0, inptr);
- this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
- this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
- outl = vec_add(this0e, this0o);
- outl = vec_add(outl, pw_bias);
- outl = vec_sr(outl, pw_one);
-
- if (outcol > 8) {
- next0 = vec_ld(16, inptr);
- next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
- next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
- outh = vec_add(next0e, next0o);
- outh = vec_add(outh, pw_bias);
- outh = vec_sr(outh, pw_one);
- } else
- outh = vec_splat_u16(0);
-
- out = vec_pack(outl, outh);
- vec_st(out, 0, outptr);
- }
- }
-}
-
-
-void
-jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
- JDIMENSION v_samp_factor,
- JDIMENSION width_in_blocks,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- int inrow, outrow, outcol;
- JDIMENSION output_cols = width_in_blocks * DCTSIZE;
- JSAMPROW inptr0, inptr1, outptr;
-
- __vector unsigned char this0, next0, this1, next1, out;
- __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
- next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
-
- /* Constants */
- __vector unsigned short pw_bias = { __4X2(1, 2) },
- pw_two = { __8X(2) };
- __vector unsigned char even_odd_index =
- { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
- pb_zero = { __16X(0) };
-
- expand_right_edge(input_data, max_v_samp_factor, image_width,
- output_cols * 2);
-
- for (inrow = 0, outrow = 0; outrow < v_samp_factor;
- inrow += 2, outrow++) {
-
- inptr0 = input_data[inrow];
- inptr1 = input_data[inrow + 1];
- outptr = output_data[outrow];
-
- for (outcol = output_cols; outcol > 0;
- outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
-
- this0 = vec_ld(0, inptr0);
- this0 = vec_perm(this0, this0, even_odd_index);
- this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
- this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
- out0l = vec_add(this0e, this0o);
-
- this1 = vec_ld(0, inptr1);
- this1 = vec_perm(this1, this1, even_odd_index);
- this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
- this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
- out1l = vec_add(this1e, this1o);
-
- outl = vec_add(out0l, out1l);
- outl = vec_add(outl, pw_bias);
- outl = vec_sr(outl, pw_two);
-
- if (outcol > 8) {
- next0 = vec_ld(16, inptr0);
- next0 = vec_perm(next0, next0, even_odd_index);
- next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
- next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
- out0h = vec_add(next0e, next0o);
-
- next1 = vec_ld(16, inptr1);
- next1 = vec_perm(next1, next1, even_odd_index);
- next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
- next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
- out1h = vec_add(next1e, next1o);
-
- outh = vec_add(out0h, out1h);
- outh = vec_add(outh, pw_bias);
- outh = vec_sr(outh, pw_two);
- } else
- outh = vec_splat_u16(0);
-
- out = vec_pack(outl, outh);
- vec_st(out, 0, outptr);
- }
- }
-}
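
One detail of these kernels deserves a note: pw_bias = {0, 1, 0, 1, ...} makes the h2v1 pair averages alternate their rounding direction across output columns (and {1, 2, ...} with a shift by two plays the same role in h2v2), so ties are not always rounded the same way. A scalar sketch of the h2v1 kernel:

#include <stdio.h>

/* Scalar equivalent (sketch) of the h2v1 kernel above: average pixel
 * pairs with the alternating 0,1 bias from pw_bias, so .5 cases round
 * down and up on alternating columns instead of always the same way. */
static void h2v1_downsample_row(const unsigned char *in, unsigned char *out,
                                int outcols)
{
  for (int i = 0; i < outcols; i++) {
    int bias = i & 1;                  /* 0,1,0,1,... as in pw_bias */
    out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
  }
}

int main(void)
{
  unsigned char in[8] = { 10, 11, 10, 11, 10, 11, 10, 11 };
  unsigned char out[4];
  h2v1_downsample_row(in, out, 4);
  for (int i = 0; i < 4; i++)
    printf("%d ", out[i]);             /* 10 11 10 11: mean preserved */
  printf("\n");
  return 0;
}
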
diff --git a/simd/powerpc/jcsample.h b/simd/powerpc/jcsample.h
deleted file mode 100644
index 2ac4816..0000000
--- a/simd/powerpc/jcsample.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * jcsample.h
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- */
-
-LOCAL(void)
-expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
- JDIMENSION output_cols)
-{
- register JSAMPROW ptr;
- register JSAMPLE pixval;
- register int count;
- int row;
- int numcols = (int)(output_cols - input_cols);
-
- if (numcols > 0) {
- for (row = 0; row < num_rows; row++) {
- ptr = image_data[row] + input_cols;
- pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
- for (count = numcols; count > 0; count--)
- *ptr++ = pixval;
- }
- }
-}
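
expand_right_edge pads each row out to output_cols * 2 by replicating its rightmost sample, which is what lets the downsampling loops above consume fixed 32-byte strides with no per-iteration bounds checks. A self-contained illustration of the replication (buffer contents are made up):

#include <stdio.h>

int main(void)
{
  /* A 5-pixel row padded to 8 columns, as expand_right_edge does. */
  unsigned char row[8] = { 7, 7, 7, 7, 9 };   /* 3 padding slots follow */
  int input_cols = 5, output_cols = 8;
  for (int i = input_cols; i < output_cols; i++)
    row[i] = row[input_cols - 1];             /* replicate last sample */
  for (int i = 0; i < output_cols; i++)
    printf("%d ", row[i]);                    /* 7 7 7 7 9 9 9 9 */
  printf("\n");
  return 0;
}
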
diff --git a/simd/powerpc/jdcolext-altivec.c b/simd/powerpc/jdcolext-altivec.c
deleted file mode 100644
index 68d52bd..0000000
--- a/simd/powerpc/jdcolext-altivec.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdcolor-altivec.c */
-
-
-void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
- JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = out_width * RGB_PIXELSIZE, num_cols;
-#if __BIG_ENDIAN__
- int offset;
-#endif
- unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
-
- __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr;
-#if __BIG_ENDIAN__
- __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
-#if RGB_PIXELSIZE == 4
- __vector unsigned char out4;
-#endif
-#endif
-#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3;
-#endif
- __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh,
- crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
- __vector int g0, g1, g2, g3;
-
- /* Constants
- * NOTE: The >> 1 compensates for the fact that vec_madds() shifts the
- * product right by 15 bits (keeping its 17 high-order bits), not 16.
- */
- __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
- pw_mf0228 = { __8X(-F_0_228 >> 1) },
- pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
- pw_one = { __8X(1) }, pw_255 = { __8X(255) },
- pw_cj = { __8X(CENTERJSAMPLE) };
- __vector int pd_onehalf = { __4X(ONE_HALF) };
- __vector unsigned char pb_zero = { __16X(0) },
-#if __BIG_ENDIAN__
- shift_pack_index =
- { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-#else
- shift_pack_index =
- { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
-#endif
-
- while (--num_rows >= 0) {
- inptr0 = input_buf[0][input_row];
- inptr1 = input_buf[1][input_row];
- inptr2 = input_buf[2][input_row];
- input_row++;
- outptr = *output_buf++;
-
- for (num_cols = pitch; num_cols > 0;
- num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16,
- inptr0 += 16, inptr1 += 16, inptr2 += 16) {
-
- y = vec_ld(0, inptr0);
- /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
- * support unsigned vectors.
- */
- yl = (__vector signed short)VEC_UNPACKHU(y);
- yh = (__vector signed short)VEC_UNPACKLU(y);
-
- cb = vec_ld(0, inptr1);
- cbl = (__vector signed short)VEC_UNPACKHU(cb);
- cbh = (__vector signed short)VEC_UNPACKLU(cb);
- cbl = vec_sub(cbl, pw_cj);
- cbh = vec_sub(cbh, pw_cj);
-
- cr = vec_ld(0, inptr2);
- crl = (__vector signed short)VEC_UNPACKHU(cr);
- crh = (__vector signed short)VEC_UNPACKLU(cr);
- crl = vec_sub(crl, pw_cj);
- crh = vec_sub(crh, pw_cj);
-
- /* (Original)
- * R = Y + 1.40200 * Cr
- * G = Y - 0.34414 * Cb - 0.71414 * Cr
- * B = Y + 1.77200 * Cb
- *
- * (This implementation)
- * R = Y + 0.40200 * Cr + Cr
- * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- * B = Y - 0.22800 * Cb + Cb + Cb
- */
- bl = vec_add(cbl, cbl);
- bh = vec_add(cbh, cbh);
- bl = vec_madds(bl, pw_mf0228, pw_one);
- bh = vec_madds(bh, pw_mf0228, pw_one);
- bl = vec_sra(bl, (__vector unsigned short)pw_one);
- bh = vec_sra(bh, (__vector unsigned short)pw_one);
- bl = vec_add(bl, cbl);
- bh = vec_add(bh, cbh);
- bl = vec_add(bl, cbl);
- bh = vec_add(bh, cbh);
- bl = vec_add(bl, yl);
- bh = vec_add(bh, yh);
-
- rl = vec_add(crl, crl);
- rh = vec_add(crh, crh);
- rl = vec_madds(rl, pw_f0402, pw_one);
- rh = vec_madds(rh, pw_f0402, pw_one);
- rl = vec_sra(rl, (__vector unsigned short)pw_one);
- rh = vec_sra(rh, (__vector unsigned short)pw_one);
- rl = vec_add(rl, crl);
- rh = vec_add(rh, crh);
- rl = vec_add(rl, yl);
- rh = vec_add(rh, yh);
-
- g0w = vec_mergeh(cbl, crl);
- g1w = vec_mergel(cbl, crl);
- g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf);
- g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf);
- g2w = vec_mergeh(cbh, crh);
- g3w = vec_mergel(cbh, crh);
- g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf);
- g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf);
- /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
- * each dword into a new 16-bit vector, which is the equivalent of
- * descaling the 32-bit results (right-shifting by 16 bits) and then
- * packing them.
- */
- gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index);
- gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index);
- gl = vec_sub(gl, crl);
- gh = vec_sub(gh, crh);
- gl = vec_add(gl, yl);
- gh = vec_add(gh, yh);
-
- rg0 = vec_mergeh(rl, gl);
- bx0 = vec_mergeh(bl, pw_255);
- rg1 = vec_mergel(rl, gl);
- bx1 = vec_mergel(bl, pw_255);
- rg2 = vec_mergeh(rh, gh);
- bx2 = vec_mergeh(bh, pw_255);
- rg3 = vec_mergel(rh, gh);
- bx3 = vec_mergel(bh, pw_255);
-
- rgbx0 = vec_packsu(rg0, bx0);
- rgbx1 = vec_packsu(rg1, bx1);
- rgbx2 = vec_packsu(rg2, bx2);
- rgbx3 = vec_packsu(rg3, bx3);
-
-#if RGB_PIXELSIZE == 3
- /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
- * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
- * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
- * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
- *
- * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
- * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
- * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
- */
- rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
- rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
- rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
-#else
- /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
- * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
- * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
- * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
- *
- * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
- * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
- * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
- * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
- */
- rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
- rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
- rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
- rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
-#endif
-
-#if __BIG_ENDIAN__
- offset = (size_t)outptr & 15;
- if (offset) {
- __vector unsigned char unaligned_shift_index;
- int bytes = num_cols + offset;
-
- if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
- /* Slow path to prevent buffer overwrite. Since there is no way to
- * write a partial AltiVec register, overwrite would occur on the
- * last chunk of the last image row if the right edge is not on a
- * 16-byte boundary. It could also occur on other rows if the number of
- * bytes per row is small enough. Since we can't determine whether we're on
- * the last image row, we have to assume every row is the last.
- */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
-#endif
- memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
- } else {
- /* Fast path */
- unaligned_shift_index = vec_lvsl(0, outptr);
- edgel = vec_ld(0, outptr);
- edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
- edges = vec_perm(edgeh, edgel, unaligned_shift_index);
- unaligned_shift_index = vec_lvsr(0, outptr);
- out0 = vec_perm(edges, rgb0, unaligned_shift_index);
- out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
- out4 = vec_perm(rgb3, edges, unaligned_shift_index);
-#else
- out3 = vec_perm(rgb2, edges, unaligned_shift_index);
-#endif
- vec_st(out0, 0, outptr);
- if (bytes > 16)
- vec_st(out1, 16, outptr);
- if (bytes > 32)
- vec_st(out2, 32, outptr);
- if (bytes > 48)
- vec_st(out3, 48, outptr);
-#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- vec_st(out4, 64, outptr);
-#endif
- }
- } else {
-#endif /* __BIG_ENDIAN__ */
- if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
- /* Slow path */
- VEC_ST(rgb0, 0, tmpbuf);
- VEC_ST(rgb1, 16, tmpbuf);
- VEC_ST(rgb2, 32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- VEC_ST(rgb3, 48, tmpbuf);
-#endif
- memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
- } else {
- /* Fast path */
- VEC_ST(rgb0, 0, outptr);
- if (num_cols > 16)
- VEC_ST(rgb1, 16, outptr);
- if (num_cols > 32)
- VEC_ST(rgb2, 32, outptr);
-#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- VEC_ST(rgb3, 48, outptr);
-#endif
- }
-#if __BIG_ENDIAN__
- }
-#endif
- }
- }
-}
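
A rough scalar model of the fixed-point conversion above (the F_* constants,
SCALEBITS, and ONE_HALF come from jdcolor-altivec.c, which includes this
file; the exact rounding of the R and B terms differs slightly, and the final
clamp to [0, 255] is what vec_packsu() provides in the vector code):

    int y  = inptr0[i];
    int cb = inptr1[i] - CENTERJSAMPLE;
    int cr = inptr2[i] - CENTERJSAMPLE;

    int r = y + ((F_0_402 * cr + ONE_HALF) >> SCALEBITS) + cr;
    int g = y + ((-F_0_344 * cb + F_0_285 * cr + ONE_HALF) >> SCALEBITS) - cr;
    int b = y + ((-F_0_228 * cb + ONE_HALF) >> SCALEBITS) + cb + cb;
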
diff --git a/simd/powerpc/jdcolor-altivec.c b/simd/powerpc/jdcolor-altivec.c
deleted file mode 100644
index eb35b67..0000000
--- a/simd/powerpc/jdcolor-altivec.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* YCC --> RGB CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 \
- { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
-#define RGB_INDEX1 \
- { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
-#define RGB_INDEX2 \
- { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX \
- { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 \
- { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
-#define RGB_INDEX1 \
- { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
-#define RGB_INDEX2 \
- { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX \
- { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX \
- { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX \
- { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
diff --git a/simd/powerpc/jdmerge-altivec.c b/simd/powerpc/jdmerge-altivec.c
deleted file mode 100644
index 79c577f..0000000
--- a/simd/powerpc/jdmerge-altivec.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554 /* FIX(0.34414) */
-#define F_0_714 46802 /* FIX(0.71414) */
-#define F_1_402 91881 /* FIX(1.40200) */
-#define F_1_772 116130 /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 \
- { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 }
-#define RGB_INDEX1 \
- { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
-#define RGB_INDEX2 \
- { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX \
- { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extrgbx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extrgbx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 \
- { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 }
-#define RGB_INDEX1 \
- { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 }
-#define RGB_INDEX2 \
- { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX \
- { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 }
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extbgrx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extbgrx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX \
- { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 }
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extxbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extxbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX \
- { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 }
-#define jsimd_h2v1_merged_upsample_altivec \
- jsimd_h2v1_extxrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec \
- jsimd_h2v2_extxrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/simd/powerpc/jdmrgext-altivec.c b/simd/powerpc/jdmrgext-altivec.c
deleted file mode 100644
index 40f02c3..0000000
--- a/simd/powerpc/jdmrgext-altivec.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdmerge-altivec.c */
-
-
-void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- JSAMPROW outptr, inptr0, inptr1, inptr2;
- int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
-#if __BIG_ENDIAN__
- int offset;
-#endif
- unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
-
- __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
- y, cb, cr;
-#if __BIG_ENDIAN__
- __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
-#if RGB_PIXELSIZE == 4
- __vector unsigned char out4;
-#endif
-#endif
-#if RGB_PIXELSIZE == 4
- __vector unsigned char rgb3;
-#endif
- __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
- crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
- rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
- __vector int g_y0, g_y1, g_y2, g_y3;
-
- /* Constants
- * NOTE: The >> 1 compensates for the fact that vec_madds() shifts the
- * product right by 15 bits (keeping its 17 high-order bits), not 16.
- */
- __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
- pw_mf0228 = { __8X(-F_0_228 >> 1) },
- pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
- pw_one = { __8X(1) }, pw_255 = { __8X(255) },
- pw_cj = { __8X(CENTERJSAMPLE) };
- __vector int pd_onehalf = { __4X(ONE_HALF) };
- __vector unsigned char pb_zero = { __16X(0) },
-#if __BIG_ENDIAN__
- shift_pack_index =
- { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
- even_index =
- { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
- odd_index =
- { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
-#else
- shift_pack_index =
- { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
- even_index =
- { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
- odd_index =
- { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
-#endif
-
- inptr0 = input_buf[0][in_row_group_ctr];
- inptr1 = input_buf[1][in_row_group_ctr];
- inptr2 = input_buf[2][in_row_group_ctr];
- outptr = output_buf[0];
-
- for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {
-
- cb = vec_ld(0, inptr1);
- /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
- * support unsigned vectors.
- */
- cbl = (__vector signed short)VEC_UNPACKHU(cb);
- cbh = (__vector signed short)VEC_UNPACKLU(cb);
- cbl = vec_sub(cbl, pw_cj);
- cbh = vec_sub(cbh, pw_cj);
-
- cr = vec_ld(0, inptr2);
- crl = (__vector signed short)VEC_UNPACKHU(cr);
- crh = (__vector signed short)VEC_UNPACKLU(cr);
- crl = vec_sub(crl, pw_cj);
- crh = vec_sub(crh, pw_cj);
-
- /* (Original)
- * R = Y + 1.40200 * Cr
- * G = Y - 0.34414 * Cb - 0.71414 * Cr
- * B = Y + 1.77200 * Cb
- *
- * (This implementation)
- * R = Y + 0.40200 * Cr + Cr
- * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
- * B = Y - 0.22800 * Cb + Cb + Cb
- */
- b_yl = vec_add(cbl, cbl);
- b_yh = vec_add(cbh, cbh);
- b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
- b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
- b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
- b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
- b_yl = vec_add(b_yl, cbl);
- b_yh = vec_add(b_yh, cbh);
- b_yl = vec_add(b_yl, cbl);
- b_yh = vec_add(b_yh, cbh);
-
- r_yl = vec_add(crl, crl);
- r_yh = vec_add(crh, crh);
- r_yl = vec_madds(r_yl, pw_f0402, pw_one);
- r_yh = vec_madds(r_yh, pw_f0402, pw_one);
- r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
- r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
- r_yl = vec_add(r_yl, crl);
- r_yh = vec_add(r_yh, crh);
-
- g_y0w = vec_mergeh(cbl, crl);
- g_y1w = vec_mergel(cbl, crl);
- g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
- g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
- g_y2w = vec_mergeh(cbh, crh);
- g_y3w = vec_mergel(cbh, crh);
- g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
- g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);
- /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from
- * each dword into a new 16-bit vector, which is the equivalent of
- * descaling the 32-bit results (right-shifting by 16 bits) and then
- * packing them.
- */
- g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
- shift_pack_index);
- g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
- shift_pack_index);
- g_yl = vec_sub(g_yl, crl);
- g_yh = vec_sub(g_yh, crh);
-
- for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
- num_cols -= RGB_PIXELSIZE * 16,
- outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {
-
- y = vec_ld(0, inptr0);
- ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
- yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);
-
- if (yloop == 0) {
- be = vec_add(b_yl, ye);
- bo = vec_add(b_yl, yo);
- re = vec_add(r_yl, ye);
- ro = vec_add(r_yl, yo);
- ge = vec_add(g_yl, ye);
- go = vec_add(g_yl, yo);
- } else {
- be = vec_add(b_yh, ye);
- bo = vec_add(b_yh, yo);
- re = vec_add(r_yh, ye);
- ro = vec_add(r_yh, yo);
- ge = vec_add(g_yh, ye);
- go = vec_add(g_yh, yo);
- }
-
- rl = vec_mergeh(re, ro);
- rh = vec_mergel(re, ro);
- gl = vec_mergeh(ge, go);
- gh = vec_mergel(ge, go);
- bl = vec_mergeh(be, bo);
- bh = vec_mergel(be, bo);
-
- rg0 = vec_mergeh(rl, gl);
- bx0 = vec_mergeh(bl, pw_255);
- rg1 = vec_mergel(rl, gl);
- bx1 = vec_mergel(bl, pw_255);
- rg2 = vec_mergeh(rh, gh);
- bx2 = vec_mergeh(bh, pw_255);
- rg3 = vec_mergel(rh, gh);
- bx3 = vec_mergel(bh, pw_255);
-
- rgbx0 = vec_packsu(rg0, bx0);
- rgbx1 = vec_packsu(rg1, bx1);
- rgbx2 = vec_packsu(rg2, bx2);
- rgbx3 = vec_packsu(rg3, bx3);
-
-#if RGB_PIXELSIZE == 3
- /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
- * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
- * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
- * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
- *
- * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
- * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
- * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
- */
- rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
- rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
- rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
-#else
- /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
- * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
- * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
- * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
- *
- * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
- * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
- * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
- * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
- */
- rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
- rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
- rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
- rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
-#endif
-
-#if __BIG_ENDIAN__
- offset = (size_t)outptr & 15;
- if (offset) {
- __vector unsigned char unaligned_shift_index;
- int bytes = num_cols + offset;
-
- if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
- /* Slow path to prevent buffer overwrite. Since there is no way to
- * write a partial AltiVec register, overwrite would occur on the
- * last chunk of the last image row if the right edge is not on a
- * 16-byte boundary. It could also occur on other rows if the bytes
- * 16-byte boundary. It could also occur on other rows if the number of
- * bytes per row is small enough. Since we can't determine whether we're on
- */
- vec_st(rgb0, 0, tmpbuf);
- vec_st(rgb1, 16, tmpbuf);
- vec_st(rgb2, 32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- vec_st(rgb3, 48, tmpbuf);
-#endif
- memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
- } else {
- /* Fast path */
- unaligned_shift_index = vec_lvsl(0, outptr);
- edgel = vec_ld(0, outptr);
- edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
- edges = vec_perm(edgeh, edgel, unaligned_shift_index);
- unaligned_shift_index = vec_lvsr(0, outptr);
- out0 = vec_perm(edges, rgb0, unaligned_shift_index);
- out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
- out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
-#if RGB_PIXELSIZE == 4
- out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
- out4 = vec_perm(rgb3, edges, unaligned_shift_index);
-#else
- out3 = vec_perm(rgb2, edges, unaligned_shift_index);
-#endif
- vec_st(out0, 0, outptr);
- if (bytes > 16)
- vec_st(out1, 16, outptr);
- if (bytes > 32)
- vec_st(out2, 32, outptr);
- if (bytes > 48)
- vec_st(out3, 48, outptr);
-#if RGB_PIXELSIZE == 4
- if (bytes > 64)
- vec_st(out4, 64, outptr);
-#endif
- }
- } else {
-#endif /* __BIG_ENDIAN__ */
- if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
- /* Slow path */
- VEC_ST(rgb0, 0, tmpbuf);
- VEC_ST(rgb1, 16, tmpbuf);
- VEC_ST(rgb2, 32, tmpbuf);
-#if RGB_PIXELSIZE == 4
- VEC_ST(rgb3, 48, tmpbuf);
-#endif
- memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
- } else {
- /* Fast path */
- VEC_ST(rgb0, 0, outptr);
- if (num_cols > 16)
- VEC_ST(rgb1, 16, outptr);
- if (num_cols > 32)
- VEC_ST(rgb2, 32, outptr);
-#if RGB_PIXELSIZE == 4
- if (num_cols > 48)
- VEC_ST(rgb3, 48, outptr);
-#endif
- }
-#if __BIG_ENDIAN__
- }
-#endif
- }
- }
-}
-
-
-void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr,
- JSAMPARRAY output_buf)
-{
- JSAMPROW inptr, outptr;
-
- inptr = input_buf[0][in_row_group_ctr];
- outptr = output_buf[0];
-
- input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
- jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
- output_buf);
-
- input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
- output_buf[0] = output_buf[1];
- jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
- output_buf);
-
- input_buf[0][in_row_group_ctr] = inptr;
- output_buf[0] = outptr;
-}
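
Note the trick in jsimd_h2v2_merged_upsample_altivec(): in h2v2 mode a single
chroma row covers two luma rows, so the function temporarily aliases
input_buf[0][in_row_group_ctr] to each of the group's two luma rows in turn,
reuses the h2v1 kernel (which reads Cb and Cr at in_row_group_ctr unchanged)
once per output row, and then restores the saved pointers.
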
diff --git a/simd/powerpc/jdsample-altivec.c b/simd/powerpc/jdsample-altivec.c
deleted file mode 100644
index 04df0cf..0000000
--- a/simd/powerpc/jdsample-altivec.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_altivec.h"
-
-
-void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr;
- int inrow, incol;
-
- __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
- out;
- __vector short this0e, this0o, this0l, this0h, last0l, last0h,
- next0l, next0h, outle, outhe, outlo, outho;
-
- /* Constants */
- __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
- last_index_col0 =
- { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
- last_index =
- { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
- next_index =
- { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 },
- next_index_lastcol =
- { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 },
-#if __BIG_ENDIAN__
- merge_pack_index =
- { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
-#else
- merge_pack_index =
- { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
-#endif
- __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
-
- for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
-
- if (downsampled_width & 15)
- inptr[downsampled_width] = inptr[downsampled_width - 1];
-
- this0 = vec_ld(0, inptr);
- p_last0 = vec_perm(this0, this0, last_index_col0);
- last0 = this0;
-
- for (incol = downsampled_width; incol > 0;
- incol -= 16, inptr += 16, outptr += 32) {
-
- if (downsampled_width - incol > 0) {
- p_last0 = vec_perm(last0, this0, last_index);
- last0 = this0;
- }
-
- if (incol <= 16)
- p_next0 = vec_perm(this0, this0, next_index_lastcol);
- else {
- next0 = vec_ld(16, inptr);
- p_next0 = vec_perm(this0, next0, next_index);
- }
-
- this0e = (__vector short)vec_mule(this0, pb_three);
- this0o = (__vector short)vec_mulo(this0, pb_three);
- this0l = vec_mergeh(this0e, this0o);
- this0h = vec_mergel(this0e, this0o);
-
- last0l = (__vector short)VEC_UNPACKHU(p_last0);
- last0h = (__vector short)VEC_UNPACKLU(p_last0);
- last0l = vec_add(last0l, pw_one);
-
- next0l = (__vector short)VEC_UNPACKHU(p_next0);
- next0h = (__vector short)VEC_UNPACKLU(p_next0);
- next0l = vec_add(next0l, pw_two);
-
- outle = vec_add(this0l, last0l);
- outlo = vec_add(this0l, next0l);
- outle = vec_sr(outle, (__vector unsigned short)pw_two);
- outlo = vec_sr(outlo, (__vector unsigned short)pw_two);
-
- out = vec_perm((__vector unsigned char)outle,
- (__vector unsigned char)outlo, merge_pack_index);
- vec_st(out, 0, outptr);
-
- if (incol > 8) {
- last0h = vec_add(last0h, pw_one);
- next0h = vec_add(next0h, pw_two);
-
- outhe = vec_add(this0h, last0h);
- outho = vec_add(this0h, next0h);
- outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
- outho = vec_sr(outho, (__vector unsigned short)pw_two);
-
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr);
- }
-
- this0 = next0;
- }
- }
-}
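
The loop above vectorizes the standard "fancy" (triangle-filter) h2v1
upsampler: each input sample yields two outputs weighted 3:1 toward the
nearer neighbor, with the row's edge samples replicated. A rough scalar
sketch (loop index i is illustrative):

    for (i = 0; i < downsampled_width; i++) {
      int left  = inptr[i > 0 ? i - 1 : 0];
      int right = inptr[i + 1 < downsampled_width ? i + 1 : i];
      outptr[2 * i]     = (JSAMPLE)((inptr[i] * 3 + left  + 1) >> 2);
      outptr[2 * i + 1] = (JSAMPLE)((inptr[i] * 3 + right + 2) >> 2);
    }
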
-
-
-void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
- int inrow, outrow, incol;
-
- __vector unsigned char this_1, this0, this1, out;
- __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
- lastcolsum_1h, lastcolsum1h,
- p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
- thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
- nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
- nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
- p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
- tmpl, tmph, outle, outhe, outlo, outho;
-
- /* Constants */
- __vector unsigned char pb_zero = { __16X(0) },
- last_index_col0 =
- { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
- last_index =
- { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
- next_index =
- { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 },
- next_index_lastcol =
- { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 },
-#if __BIG_ENDIAN__
- merge_pack_index =
- { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
-#else
- merge_pack_index =
- { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
-#endif
- __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
- pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
- __vector unsigned short pw_four = { __8X(4) };
-
- for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
- inptr_1 = input_data[inrow - 1];
- inptr0 = input_data[inrow];
- inptr1 = input_data[inrow + 1];
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
-
- if (downsampled_width & 15) {
- inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
- inptr0[downsampled_width] = inptr0[downsampled_width - 1];
- inptr1[downsampled_width] = inptr1[downsampled_width - 1];
- }
-
- this0 = vec_ld(0, inptr0);
- this0l = (__vector short)VEC_UNPACKHU(this0);
- this0h = (__vector short)VEC_UNPACKLU(this0);
- this0l = vec_mladd(this0l, pw_three, pw_zero);
- this0h = vec_mladd(this0h, pw_three, pw_zero);
-
- this_1 = vec_ld(0, inptr_1);
- this_1l = (__vector short)VEC_UNPACKHU(this_1);
- this_1h = (__vector short)VEC_UNPACKLU(this_1);
- thiscolsum_1l = vec_add(this0l, this_1l);
- thiscolsum_1h = vec_add(this0h, this_1h);
- lastcolsum_1h = thiscolsum_1h;
- p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
- p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
-
- this1 = vec_ld(0, inptr1);
- this1l = (__vector short)VEC_UNPACKHU(this1);
- this1h = (__vector short)VEC_UNPACKLU(this1);
- thiscolsum1l = vec_add(this0l, this1l);
- thiscolsum1h = vec_add(this0h, this1h);
- lastcolsum1h = thiscolsum1h;
- p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
- p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
-
- for (incol = downsampled_width; incol > 0;
- incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
- outptr0 += 32, outptr1 += 32) {
-
- if (downsampled_width - incol > 0) {
- p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
- p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
- p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
- p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
- lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;
- }
-
- if (incol <= 16) {
- p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
- p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
- next_index_lastcol);
- p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
- p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
- next_index_lastcol);
- } else {
- this0 = vec_ld(16, inptr0);
- this0l = (__vector short)VEC_UNPACKHU(this0);
- this0h = (__vector short)VEC_UNPACKLU(this0);
- this0l = vec_mladd(this0l, pw_three, pw_zero);
- this0h = vec_mladd(this0h, pw_three, pw_zero);
-
- this_1 = vec_ld(16, inptr_1);
- this_1l = (__vector short)VEC_UNPACKHU(this_1);
- this_1h = (__vector short)VEC_UNPACKLU(this_1);
- nextcolsum_1l = vec_add(this0l, this_1l);
- nextcolsum_1h = vec_add(this0h, this_1h);
- p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
- p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);
-
- this1 = vec_ld(16, inptr1);
- this1l = (__vector short)VEC_UNPACKHU(this1);
- this1h = (__vector short)VEC_UNPACKLU(this1);
- nextcolsum1l = vec_add(this0l, this1l);
- nextcolsum1h = vec_add(this0h, this1h);
- p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
- p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
- }
-
- /* Process the upper row */
-
- tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
- outle = vec_add(tmpl, p_lastcolsum_1l);
- outle = vec_add(outle, pw_eight);
- outle = vec_sr(outle, pw_four);
-
- outlo = vec_add(tmpl, p_nextcolsum_1l);
- outlo = vec_add(outlo, pw_seven);
- outlo = vec_sr(outlo, pw_four);
-
- out = vec_perm((__vector unsigned char)outle,
- (__vector unsigned char)outlo, merge_pack_index);
- vec_st(out, 0, outptr0);
-
- if (incol > 8) {
- tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
- outhe = vec_add(tmph, p_lastcolsum_1h);
- outhe = vec_add(outhe, pw_eight);
- outhe = vec_sr(outhe, pw_four);
-
- outho = vec_add(tmph, p_nextcolsum_1h);
- outho = vec_add(outho, pw_seven);
- outho = vec_sr(outho, pw_four);
-
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr0);
- }
-
- /* Process the lower row */
-
- tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
- outle = vec_add(tmpl, p_lastcolsum1l);
- outle = vec_add(outle, pw_eight);
- outle = vec_sr(outle, pw_four);
-
- outlo = vec_add(tmpl, p_nextcolsum1l);
- outlo = vec_add(outlo, pw_seven);
- outlo = vec_sr(outlo, pw_four);
-
- out = vec_perm((__vector unsigned char)outle,
- (__vector unsigned char)outlo, merge_pack_index);
- vec_st(out, 0, outptr1);
-
- if (incol > 8) {
- tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
- outhe = vec_add(tmph, p_lastcolsum1h);
- outhe = vec_add(outhe, pw_eight);
- outhe = vec_sr(outhe, pw_four);
-
- outho = vec_add(tmph, p_nextcolsum1h);
- outho = vec_add(outho, pw_seven);
- outho = vec_sr(outho, pw_four);
-
- out = vec_perm((__vector unsigned char)outhe,
- (__vector unsigned char)outho, merge_pack_index);
- vec_st(out, 16, outptr1);
- }
-
- thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
- thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
- }
- }
-}
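
The h2v2 variant factors the 9/3/3/1 triangle filter into a vertical pass
(the "colsum" vectors: 3 * this row + the adjacent row) followed by the same
horizontal 3:1 blend as above. A rough scalar sketch for one output row,
where cs[] stands in for the thiscolsum/lastcolsum/nextcolsum chain:

    /* vertical: near_row is the row above for outptr0, below for outptr1 */
    cs[i] = inptr0[i] * 3 + near_row[i];

    /* horizontal: note the different rounding for even and odd outputs */
    outptr[2 * i]     = (JSAMPLE)((cs[i] * 3 + cs[i - 1] + 8) >> 4);
    outptr[2 * i + 1] = (JSAMPLE)((cs[i] * 3 + cs[i + 1] + 7) >> 4);
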
-
-
-/* These are rarely used (mainly just for decompressing YCCK images) */
-
-void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr;
- int inrow, incol;
-
- __vector unsigned char in, inl, inh;
-
- for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
-
- for (incol = (output_width + 31) & (~31); incol > 0;
- incol -= 64, inptr += 32, outptr += 64) {
-
- in = vec_ld(0, inptr);
- inl = vec_mergeh(in, in);
- inh = vec_mergel(in, in);
-
- vec_st(inl, 0, outptr);
- vec_st(inh, 16, outptr);
-
- if (incol > 32) {
- in = vec_ld(16, inptr);
- inl = vec_mergeh(in, in);
- inh = vec_mergel(in, in);
-
- vec_st(inl, 32, outptr);
- vec_st(inh, 48, outptr);
- }
- }
- }
-}
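
Merging a vector with itself (the vec_mergeh()/vec_mergel() calls above)
interleaves each byte with a copy of itself, so the whole function reduces to
plain pixel doubling; a scalar sketch:

    for (i = 0; i < (output_width + 1) / 2; i++)
      outptr[2 * i] = outptr[2 * i + 1] = inptr[i];
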
-
-
-void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr0, outptr1;
- int inrow, outrow, incol;
-
- __vector unsigned char in, inl, inh;
-
- for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
- inptr = input_data[inrow];
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
-
- for (incol = (output_width + 31) & (~31); incol > 0;
- incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
-
- in = vec_ld(0, inptr);
- inl = vec_mergeh(in, in);
- inh = vec_mergel(in, in);
-
- vec_st(inl, 0, outptr0);
- vec_st(inl, 0, outptr1);
-
- vec_st(inh, 16, outptr0);
- vec_st(inh, 16, outptr1);
-
- if (incol > 32) {
- in = vec_ld(16, inptr);
- inl = vec_mergeh(in, in);
- inh = vec_mergel(in, in);
-
- vec_st(inl, 32, outptr0);
- vec_st(inl, 32, outptr1);
-
- vec_st(inh, 48, outptr0);
- vec_st(inh, 48, outptr1);
- }
- }
- }
-}
diff --git a/simd/powerpc/jfdctfst-altivec.c b/simd/powerpc/jfdctfst-altivec.c
deleted file mode 100644
index ad9af81..0000000
--- a/simd/powerpc/jfdctfst-altivec.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* FAST INTEGER FORWARD DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in CONST_SHIFT). This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- * the elements in arg3 + the most significant 17 bits of
- * (the elements in arg1 * the elements in arg2).
- */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_382 98 /* FIX(0.382683433) */
-#define F_0_541 139 /* FIX(0.541196100) */
-#define F_0_707 181 /* FIX(0.707106781) */
-#define F_1_306 334 /* FIX(1.306562965) */
-
-#define CONST_BITS 8
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
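
Concretely, the scalar operation being modeled is MULTIPLY(x, F) =
(x * F) >> CONST_BITS with F = c * 2^CONST_BITS, and the shift budget works
out as (a worked equation, using the values defined above):

    ((x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT)) >> 15
      = ((x << 2) * (F << 5)) >> 15
      = (x * F) >> 8                    /* since 2 + 5 = 15 - 8 */

where ">> 15" is what vec_madds() effectively performs when it keeps the 17
most significant bits of the 32-bit product.
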
-
-
-#define DO_FDCT() { \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out4 = vec_sub(tmp10, tmp11); \
- \
- z1 = vec_add(tmp12, tmp13); \
- z1 = vec_sl(z1, pre_multiply_scale_bits); \
- z1 = vec_madds(z1, pw_0707, pw_zero); \
- \
- out2 = vec_add(tmp13, z1); \
- out6 = vec_sub(tmp13, z1); \
- \
- /* Odd part */ \
- \
- tmp10 = vec_add(tmp4, tmp5); \
- tmp11 = vec_add(tmp5, tmp6); \
- tmp12 = vec_add(tmp6, tmp7); \
- \
- tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- z5 = vec_sub(tmp10, tmp12); \
- z5 = vec_madds(z5, pw_0382, pw_zero); \
- \
- z2 = vec_madds(tmp10, pw_0541, z5); \
- z4 = vec_madds(tmp12, pw_1306, z5); \
- \
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- z3 = vec_madds(tmp11, pw_0707, pw_zero); \
- \
- z11 = vec_add(tmp7, z3); \
- z13 = vec_sub(tmp7, z3); \
- \
- out5 = vec_add(z13, z2); \
- out3 = vec_sub(z13, z2); \
- out1 = vec_add(z11, z4); \
- out7 = vec_sub(z11, z4); \
-}
-
-
-void jsimd_fdct_ifast_altivec(DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- z1, z2, z3, z4, z5, z11, z13,
- out0, out1, out2, out3, out4, out5, out6, out7;
-
- /* Constants */
- __vector short pw_zero = { __8X(0) },
- pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
- pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
- pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
- pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
- __vector unsigned short
- pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
-
- /* Pass 1: process rows */
-
- row0 = vec_ld(0, data);
- row1 = vec_ld(16, data);
- row2 = vec_ld(32, data);
- row3 = vec_ld(48, data);
- row4 = vec_ld(64, data);
- row5 = vec_ld(80, data);
- row6 = vec_ld(96, data);
- row7 = vec_ld(112, data);
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT();
-
- /* Pass 2: process columns */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT();
-
- vec_st(out0, 0, data);
- vec_st(out1, 16, data);
- vec_st(out2, 32, data);
- vec_st(out3, 48, data);
- vec_st(out4, 64, data);
- vec_st(out5, 80, data);
- vec_st(out6, 96, data);
- vec_st(out7, 112, data);
-}
diff --git a/simd/powerpc/jfdctint-altivec.c b/simd/powerpc/jfdctint-altivec.c
deleted file mode 100644
index 6e63cc1..0000000
--- a/simd/powerpc/jfdctint-altivec.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-
-#define DO_FDCT_COMMON(PASS) { \
- /* (Original) \
- * z1 = (tmp12 + tmp13) * 0.541196100; \
- * data2 = z1 + tmp13 * 0.765366865; \
- * data6 = z1 + tmp12 * -1.847759065; \
- * \
- * (This implementation) \
- * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
- * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
- */ \
- \
- tmp1312l = vec_mergeh(tmp13, tmp12); \
- tmp1312h = vec_mergel(tmp13, tmp12); \
- \
- out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
- out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
- out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
- out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
- \
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out6 = vec_pack(out6l, out6h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(tmp4, tmp6); \
- z4 = vec_add(tmp5, tmp7); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
- \
- /* (Original) \
- * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
- * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
- * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
- * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
- * \
- * (This implementation) \
- * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
- * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
- * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
- * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
- * data7 = tmp4 + z3; data5 = tmp5 + z4; \
- * data3 = tmp6 + z3; data1 = tmp7 + z4; \
- */ \
- \
- tmp47l = vec_mergeh(tmp4, tmp7); \
- tmp47h = vec_mergel(tmp4, tmp7); \
- \
- out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
- out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
- out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
- out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
- \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- \
- out7 = vec_pack(out7l, out7h); \
- out1 = vec_pack(out1l, out1h); \
- \
- tmp56l = vec_mergeh(tmp5, tmp6); \
- tmp56h = vec_mergel(tmp5, tmp6); \
- \
- out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
- out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
- out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
- out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
- \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- \
- out5 = vec_pack(out5l, out5h); \
- out3 = vec_pack(out3l, out3h); \
-}
-
-#define DO_FDCT_PASS1() { \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_sl(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_sl(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(1); \
-}
-
-#define DO_FDCT_PASS2() { \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_add(out0, pw_descale_p2x); \
- out0 = vec_sra(out0, pass1_bits); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_add(out4, pw_descale_p2x); \
- out4 = vec_sra(out4, pass1_bits); \
- \
- DO_FDCT_COMMON(2); \
-}
-
-
-void jsimd_fdct_islow_altivec(DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
- z3, z4, z34l, z34h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int z3l, z3h, z4l, z4h,
- out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
- out7l, out7h;
-
- /* Constants */
- __vector short
- pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
- pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
- pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
- pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
- pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
- pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
- pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
- pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
- pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
- __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
- __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
- pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
- __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
- descale_p2 = { __4X(DESCALE_P2) };
-
- /* Pass 1: process rows */
-
- row0 = vec_ld(0, data);
- row1 = vec_ld(16, data);
- row2 = vec_ld(32, data);
- row3 = vec_ld(48, data);
- row4 = vec_ld(64, data);
- row5 = vec_ld(80, data);
- row6 = vec_ld(96, data);
- row7 = vec_ld(112, data);
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT_PASS1();
-
- /* Pass 2: process columns */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT_PASS2();
-
- vec_st(out0, 0, data);
- vec_st(out1, 16, data);
- vec_st(out2, 32, data);
- vec_st(out3, 48, data);
- vec_st(out4, 64, data);
- vec_st(out5, 80, data);
- vec_st(out6, 96, data);
- vec_st(out7, 112, data);
-}
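
Throughout the function above, pairs of 16-bit terms are multiplied by pairs
of constants in one step: vec_mergeh()/vec_mergel() interleave two vectors a
and b, and vec_msums() with a { c1, c2, c1, c2, ... } constant then computes,
per 32-bit lane, a[i] * c1 + b[i] * c2 + acc[i]. That is why the constants
are declared as __4X2() pairs. For example, out2 corresponds to the scalar

    data2 = (tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541 +
             (1 << (DESCALE_P1 - 1))) >> DESCALE_P1;

which is exactly the refactored form given in the comment inside
DO_FDCT_COMMON().
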
diff --git a/simd/powerpc/jidctfst-altivec.c b/simd/powerpc/jidctfst-altivec.c
deleted file mode 100644
index 456c6c6..0000000
--- a/simd/powerpc/jidctfst-altivec.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* FAST INTEGER INVERSE DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in CONST_SHIFT). This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- * the elements in arg3 + the most significant 17 bits of
- * (the elements in arg1 * the elements in arg2).
- */
-
-#include "jsimd_altivec.h"
-
-
-#define F_1_082 277 /* FIX(1.082392200) */
-#define F_1_414 362 /* FIX(1.414213562) */
-#define F_1_847 473 /* FIX(1.847759065) */
-#define F_2_613 669 /* FIX(2.613125930) */
-#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
-
-#define CONST_BITS 8
-#define PASS1_BITS 2
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
-
-
-#define DO_IDCT(in) { \
- /* Even part */ \
- \
- tmp10 = vec_add(in##0, in##4); \
- tmp11 = vec_sub(in##0, in##4); \
- tmp13 = vec_add(in##2, in##6); \
- \
- tmp12 = vec_sub(in##2, in##6); \
- tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
- tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
- tmp12 = vec_sub(tmp12, tmp13); \
- \
- tmp0 = vec_add(tmp10, tmp13); \
- tmp3 = vec_sub(tmp10, tmp13); \
- tmp1 = vec_add(tmp11, tmp12); \
- tmp2 = vec_sub(tmp11, tmp12); \
- \
- /* Odd part */ \
- \
- z13 = vec_add(in##5, in##3); \
- z10 = vec_sub(in##5, in##3); \
- z10s = vec_sl(z10, pre_multiply_scale_bits); \
- z11 = vec_add(in##1, in##7); \
- z12s = vec_sub(in##1, in##7); \
- z12s = vec_sl(z12s, pre_multiply_scale_bits); \
- \
- tmp11 = vec_sub(z11, z13); \
- tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
- tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
- \
- tmp7 = vec_add(z11, z13); \
- \
- /* To avoid overflow... \
- * \
- * (Original) \
- * tmp12 = -2.613125930 * z10 + z5; \
- * \
- * (This implementation) \
- * tmp12 = (-1.613125930 - 1) * z10 + z5; \
- * = -1.613125930 * z10 - z10 + z5; \
- */ \
- \
- z5 = vec_add(z10s, z12s); \
- z5 = vec_madds(z5, pw_F1847, pw_zero); \
- \
- tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
- tmp10 = vec_sub(tmp10, z5); \
- tmp12 = vec_madds(z10s, pw_MF1613, z5); \
- tmp12 = vec_sub(tmp12, z10); \
- \
- tmp6 = vec_sub(tmp12, tmp7); \
- tmp5 = vec_sub(tmp11, tmp6); \
- tmp4 = vec_add(tmp10, tmp5); \
- \
- out0 = vec_add(tmp0, tmp7); \
- out1 = vec_add(tmp1, tmp6); \
- out2 = vec_add(tmp2, tmp5); \
- out3 = vec_sub(tmp3, tmp4); \
- out4 = vec_add(tmp3, tmp4); \
- out5 = vec_sub(tmp2, tmp5); \
- out6 = vec_sub(tmp1, tmp6); \
- out7 = vec_sub(tmp0, tmp7); \
-}
-
-
-void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- short *dct_table = (short *)dct_table_;
- int *outptr;
-
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- z5, z10, z10s, z11, z12s, z13,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector signed char outb;
-
- /* Constants */
- __vector short pw_zero = { __8X(0) },
- pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
- pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
- pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
- pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
- __vector unsigned short
- pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
- pass1_bits3 = { __8X(PASS1_BITS + 3) };
- __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
-
- /* Pass 1: process columns */
-
- col0 = vec_ld(0, coef_block);
- col1 = vec_ld(16, coef_block);
- col2 = vec_ld(32, coef_block);
- col3 = vec_ld(48, coef_block);
- col4 = vec_ld(64, coef_block);
- col5 = vec_ld(80, coef_block);
- col6 = vec_ld(96, coef_block);
- col7 = vec_ld(112, coef_block);
-
- tmp1 = vec_or(col1, col2);
- tmp2 = vec_or(col3, col4);
- tmp1 = vec_or(tmp1, tmp2);
- tmp3 = vec_or(col5, col6);
- tmp3 = vec_or(tmp3, col7);
- tmp1 = vec_or(tmp1, tmp3);
-
- quant0 = vec_ld(0, dct_table);
- col0 = vec_mladd(col0, quant0, pw_zero);
-
- if (vec_all_eq(tmp1, pw_zero)) {
- /* AC terms all zero */
-
- row0 = vec_splat(col0, 0);
- row1 = vec_splat(col0, 1);
- row2 = vec_splat(col0, 2);
- row3 = vec_splat(col0, 3);
- row4 = vec_splat(col0, 4);
- row5 = vec_splat(col0, 5);
- row6 = vec_splat(col0, 6);
- row7 = vec_splat(col0, 7);
-
- } else {
-
- quant1 = vec_ld(16, dct_table);
- quant2 = vec_ld(32, dct_table);
- quant3 = vec_ld(48, dct_table);
- quant4 = vec_ld(64, dct_table);
- quant5 = vec_ld(80, dct_table);
- quant6 = vec_ld(96, dct_table);
- quant7 = vec_ld(112, dct_table);
-
- col1 = vec_mladd(col1, quant1, pw_zero);
- col2 = vec_mladd(col2, quant2, pw_zero);
- col3 = vec_mladd(col3, quant3, pw_zero);
- col4 = vec_mladd(col4, quant4, pw_zero);
- col5 = vec_mladd(col5, quant5, pw_zero);
- col6 = vec_mladd(col6, quant6, pw_zero);
- col7 = vec_mladd(col7, quant7, pw_zero);
-
- DO_IDCT(col);
-
- TRANSPOSE(out, row);
- }
-
- /* Pass 2: process rows */
-
- DO_IDCT(row);
-
- out0 = vec_sra(out0, pass1_bits3);
- out1 = vec_sra(out1, pass1_bits3);
- out2 = vec_sra(out2, pass1_bits3);
- out3 = vec_sra(out3, pass1_bits3);
- out4 = vec_sra(out4, pass1_bits3);
- out5 = vec_sra(out5, pass1_bits3);
- out6 = vec_sra(out6, pass1_bits3);
- out7 = vec_sra(out7, pass1_bits3);
-
- TRANSPOSE(out, col);
-
- outb = vec_packs(col0, col0);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[0] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col1, col1);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[1] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col2, col2);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[2] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col3, col3);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[3] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col4, col4);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[4] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col5, col5);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[5] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col6, col6);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[6] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col7, col7);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[7] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-}
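
The overflow note inside DO_IDCT above is a plain fixed-point identity: at CONST_BITS == 8, multiplying by FIX(2.613125930) == 669 directly would overflow the scaled 16-bit multiply, so the macro multiplies by F_1_613 == 413 and then subtracts z10 once more (after vec_madds() has descaled, subtracting plain z10 is the same as subtracting FIX(1) * z10 == 256 * z10 before descaling). A scalar check with arbitrary test values:

#include <assert.h>

int main(void)
{
  int z10 = 1234, z5 = 5678;                    /* arbitrary test values */
  int direct    = -669 * z10 + z5;              /* -FIX(2.613125930) * z10 + z5 */
  int rewritten = -413 * z10 - 256 * z10 + z5;  /* -F_1_613*z10 - FIX(1)*z10 + z5 */
  assert(direct == rewritten);                  /* because 669 == 413 + 256 */
  return 0;
}
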
diff --git a/simd/powerpc/jidctint-altivec.c b/simd/powerpc/jidctint-altivec.c
deleted file mode 100644
index 0e5dd58..0000000
--- a/simd/powerpc/jidctint-altivec.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER INVERSE DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446 /* FIX(0.298631336) */
-#define F_0_390 3196 /* FIX(0.390180644) */
-#define F_0_541 4433 /* FIX(0.541196100) */
-#define F_0_765 6270 /* FIX(0.765366865) */
-#define F_0_899 7373 /* FIX(0.899976223) */
-#define F_1_175 9633 /* FIX(1.175875602) */
-#define F_1_501 12299 /* FIX(1.501321110) */
-#define F_1_847 15137 /* FIX(1.847759065) */
-#define F_1_961 16069 /* FIX(1.961570560) */
-#define F_2_053 16819 /* FIX(2.053119869) */
-#define F_2_562 20995 /* FIX(2.562915447) */
-#define F_3_072 25172 /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
-
-
-#define DO_IDCT(in, PASS) { \
- /* Even part \
- * \
- * (Original) \
- * z1 = (z2 + z3) * 0.541196100; \
- * tmp2 = z1 + z3 * -1.847759065; \
- * tmp3 = z1 + z2 * 0.765366865; \
- * \
- * (This implementation) \
- * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
- * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
- */ \
- \
- in##26l = vec_mergeh(in##2, in##6); \
- in##26h = vec_mergel(in##2, in##6); \
- \
- tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
- tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
- tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
- tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
- \
- tmp0 = vec_add(in##0, in##4); \
- tmp1 = vec_sub(in##0, in##4); \
- \
- tmp0l = vec_unpackh(tmp0); \
- tmp0h = vec_unpackl(tmp0); \
- tmp0l = vec_sl(tmp0l, const_bits); \
- tmp0h = vec_sl(tmp0h, const_bits); \
- tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
- tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
- \
- tmp10l = vec_add(tmp0l, tmp3l); \
- tmp10h = vec_add(tmp0h, tmp3h); \
- tmp13l = vec_sub(tmp0l, tmp3l); \
- tmp13h = vec_sub(tmp0h, tmp3h); \
- \
- tmp1l = vec_unpackh(tmp1); \
- tmp1h = vec_unpackl(tmp1); \
- tmp1l = vec_sl(tmp1l, const_bits); \
- tmp1h = vec_sl(tmp1h, const_bits); \
- tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
- tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
- \
- tmp11l = vec_add(tmp1l, tmp2l); \
- tmp11h = vec_add(tmp1h, tmp2h); \
- tmp12l = vec_sub(tmp1l, tmp2l); \
- tmp12h = vec_sub(tmp1h, tmp2h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(in##3, in##7); \
- z4 = vec_add(in##1, in##5); \
- \
- /* (Original) \
- * z5 = (z3 + z4) * 1.175875602; \
- * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
- * z3 += z5; z4 += z5; \
- * \
- * (This implementation) \
- * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
- * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
- */ \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
- z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
- z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
- z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
- \
- /* (Original) \
- * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
- * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
- * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
- * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
- * tmp0 += z1 + z3; tmp1 += z2 + z4; \
- * tmp2 += z2 + z3; tmp3 += z1 + z4; \
- * \
- * (This implementation) \
- * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
- * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
- * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
- * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
- * tmp0 += z3; tmp1 += z4; \
- * tmp2 += z3; tmp3 += z4; \
- */ \
- \
- in##71l = vec_mergeh(in##7, in##1); \
- in##71h = vec_mergel(in##7, in##1); \
- \
- tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
- tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
- tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
- tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
- \
- in##53l = vec_mergeh(in##5, in##3); \
- in##53h = vec_mergel(in##5, in##3); \
- \
- tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
- tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
- tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
- tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
- \
- /* Final output stage */ \
- \
- out0l = vec_add(tmp10l, tmp3l); \
- out0h = vec_add(tmp10h, tmp3h); \
- out7l = vec_sub(tmp10l, tmp3l); \
- out7h = vec_sub(tmp10h, tmp3h); \
- \
- out0l = vec_sra(out0l, descale_p##PASS); \
- out0h = vec_sra(out0h, descale_p##PASS); \
- out7l = vec_sra(out7l, descale_p##PASS); \
- out7h = vec_sra(out7h, descale_p##PASS); \
- \
- out0 = vec_pack(out0l, out0h); \
- out7 = vec_pack(out7l, out7h); \
- \
- out1l = vec_add(tmp11l, tmp2l); \
- out1h = vec_add(tmp11h, tmp2h); \
- out6l = vec_sub(tmp11l, tmp2l); \
- out6h = vec_sub(tmp11h, tmp2h); \
- \
- out1l = vec_sra(out1l, descale_p##PASS); \
- out1h = vec_sra(out1h, descale_p##PASS); \
- out6l = vec_sra(out6l, descale_p##PASS); \
- out6h = vec_sra(out6h, descale_p##PASS); \
- \
- out1 = vec_pack(out1l, out1h); \
- out6 = vec_pack(out6l, out6h); \
- \
- out2l = vec_add(tmp12l, tmp1l); \
- out2h = vec_add(tmp12h, tmp1h); \
- out5l = vec_sub(tmp12l, tmp1l); \
- out5h = vec_sub(tmp12h, tmp1h); \
- \
- out2l = vec_sra(out2l, descale_p##PASS); \
- out2h = vec_sra(out2h, descale_p##PASS); \
- out5l = vec_sra(out5l, descale_p##PASS); \
- out5h = vec_sra(out5h, descale_p##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out5 = vec_pack(out5l, out5h); \
- \
- out3l = vec_add(tmp13l, tmp0l); \
- out3h = vec_add(tmp13h, tmp0h); \
- out4l = vec_sub(tmp13l, tmp0l); \
- out4h = vec_sub(tmp13h, tmp0h); \
- \
- out3l = vec_sra(out3l, descale_p##PASS); \
- out3h = vec_sra(out3h, descale_p##PASS); \
- out4l = vec_sra(out4l, descale_p##PASS); \
- out4h = vec_sra(out4h, descale_p##PASS); \
- \
- out3 = vec_pack(out3l, out3h); \
- out4 = vec_pack(out4l, out4h); \
-}
-
-
-void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- short *dct_table = (short *)dct_table_;
- int *outptr;
-
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
- tmp0, tmp1, tmp2, tmp3, z3, z4,
- z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
- row71l, row71h, row26l, row26h, row53l, row53h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
- tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
- z3l, z3h, z4l, z4h,
- out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
- out5l, out5h, out6l, out6h, out7l, out7h;
- __vector signed char outb;
-
- /* Constants */
- __vector short pw_zero = { __8X(0) },
- pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
- pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
- pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
- pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
- pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
- pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
- pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
- pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
- __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
- __vector int pd_zero = { __4X(0) },
- pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
- pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
- __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
- descale_p2 = { __4X(DESCALE_P2) },
- const_bits = { __4X(CONST_BITS) };
- __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
-
- /* Pass 1: process columns */
-
- col0 = vec_ld(0, coef_block);
- col1 = vec_ld(16, coef_block);
- col2 = vec_ld(32, coef_block);
- col3 = vec_ld(48, coef_block);
- col4 = vec_ld(64, coef_block);
- col5 = vec_ld(80, coef_block);
- col6 = vec_ld(96, coef_block);
- col7 = vec_ld(112, coef_block);
-
- tmp1 = vec_or(col1, col2);
- tmp2 = vec_or(col3, col4);
- tmp1 = vec_or(tmp1, tmp2);
- tmp3 = vec_or(col5, col6);
- tmp3 = vec_or(tmp3, col7);
- tmp1 = vec_or(tmp1, tmp3);
-
- quant0 = vec_ld(0, dct_table);
- col0 = vec_mladd(col0, quant0, pw_zero);
-
- if (vec_all_eq(tmp1, pw_zero)) {
- /* AC terms all zero */
-
- col0 = vec_sl(col0, pass1_bits);
-
- row0 = vec_splat(col0, 0);
- row1 = vec_splat(col0, 1);
- row2 = vec_splat(col0, 2);
- row3 = vec_splat(col0, 3);
- row4 = vec_splat(col0, 4);
- row5 = vec_splat(col0, 5);
- row6 = vec_splat(col0, 6);
- row7 = vec_splat(col0, 7);
-
- } else {
-
- quant1 = vec_ld(16, dct_table);
- quant2 = vec_ld(32, dct_table);
- quant3 = vec_ld(48, dct_table);
- quant4 = vec_ld(64, dct_table);
- quant5 = vec_ld(80, dct_table);
- quant6 = vec_ld(96, dct_table);
- quant7 = vec_ld(112, dct_table);
-
- col1 = vec_mladd(col1, quant1, pw_zero);
- col2 = vec_mladd(col2, quant2, pw_zero);
- col3 = vec_mladd(col3, quant3, pw_zero);
- col4 = vec_mladd(col4, quant4, pw_zero);
- col5 = vec_mladd(col5, quant5, pw_zero);
- col6 = vec_mladd(col6, quant6, pw_zero);
- col7 = vec_mladd(col7, quant7, pw_zero);
-
- DO_IDCT(col, 1);
-
- TRANSPOSE(out, row);
- }
-
- /* Pass 2: process rows */
-
- DO_IDCT(row, 2);
-
- TRANSPOSE(out, col);
-
- outb = vec_packs(col0, col0);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[0] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col1, col1);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[1] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col2, col2);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[2] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col3, col3);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[3] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col4, col4);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[4] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col5, col5);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[5] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col6, col6);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[6] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-
- outb = vec_packs(col7, col7);
- outb = vec_add(outb, pb_centerjsamp);
- outptr = (int *)(output_buf[7] + output_col);
- vec_ste((__vector int)outb, 0, outptr);
- vec_ste((__vector int)outb, 4, outptr);
-}
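
Both passes above descale by adding pd_descale_p1/pd_descale_p2 and then shifting with vec_sra(): adding half of the final unit before an arithmetic right shift rounds to the nearest integer instead of truncating. A scalar sketch of the step (descale is an illustrative name; the shift amounts follow the definitions above):

#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)

static short descale(int x, int shift)
{
  return (short)((x + (1 << (shift - 1))) >> shift);  /* round to nearest */
}

/* pass 1: descale(lane, DESCALE_P1); pass 2: descale(lane, DESCALE_P2) */
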
diff --git a/simd/powerpc/jquanti-altivec.c b/simd/powerpc/jquanti-altivec.c
deleted file mode 100644
index 7d6e325..0000000
--- a/simd/powerpc/jquanti-altivec.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
- * always get the data we want by using a single vector load (although we may
- * have to permute the result.)
- */
-#if __BIG_ENDIAN__
-
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_ld(0, elemptr); \
- if ((size_t)elemptr & 15) \
- in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
-}
-
-#else
-
-#define LOAD_ROW(row) { \
- elemptr = sample_data[row] + start_col; \
- in##row = vec_vsx_ld(0, elemptr); \
-}
-
-#endif
-
-
-void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- JSAMPROW elemptr;
-
- __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
- __vector short out0, out1, out2, out3, out4, out5, out6, out7;
-
- /* Constants */
- __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
- __vector unsigned char pb_zero = { __16X(0) };
-
- LOAD_ROW(0);
- LOAD_ROW(1);
- LOAD_ROW(2);
- LOAD_ROW(3);
- LOAD_ROW(4);
- LOAD_ROW(5);
- LOAD_ROW(6);
- LOAD_ROW(7);
-
- out0 = (__vector short)VEC_UNPACKHU(in0);
- out1 = (__vector short)VEC_UNPACKHU(in1);
- out2 = (__vector short)VEC_UNPACKHU(in2);
- out3 = (__vector short)VEC_UNPACKHU(in3);
- out4 = (__vector short)VEC_UNPACKHU(in4);
- out5 = (__vector short)VEC_UNPACKHU(in5);
- out6 = (__vector short)VEC_UNPACKHU(in6);
- out7 = (__vector short)VEC_UNPACKHU(in7);
-
- out0 = vec_sub(out0, pw_centerjsamp);
- out1 = vec_sub(out1, pw_centerjsamp);
- out2 = vec_sub(out2, pw_centerjsamp);
- out3 = vec_sub(out3, pw_centerjsamp);
- out4 = vec_sub(out4, pw_centerjsamp);
- out5 = vec_sub(out5, pw_centerjsamp);
- out6 = vec_sub(out6, pw_centerjsamp);
- out7 = vec_sub(out7, pw_centerjsamp);
-
- vec_st(out0, 0, workspace);
- vec_st(out1, 16, workspace);
- vec_st(out2, 32, workspace);
- vec_st(out3, 48, workspace);
- vec_st(out4, 64, workspace);
- vec_st(out5, 80, workspace);
- vec_st(out6, 96, workspace);
- vec_st(out7, 112, workspace);
-}
-
-
-#define WORD_BIT 16
-
-/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
- We basically need an unsigned equivalent of vec_madds(). */
-
-#define MULTIPLY(vs0, vs1, out) { \
- tmpe = vec_mule((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- tmpo = vec_mulo((__vector unsigned short)vs0, \
- (__vector unsigned short)vs1); \
- out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
- (__vector unsigned short)tmpo, \
- shift_pack_index); \
-}
-
-void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
- DCTELEM *workspace)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
- corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
- recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
- scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
- __vector unsigned int tmpe, tmpo;
-
- /* Constants */
- __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
-#if __BIG_ENDIAN__
- __vector unsigned char shift_pack_index =
- { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 };
-#else
- __vector unsigned char shift_pack_index =
- { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
-#endif
-
- row0 = vec_ld(0, workspace);
- row1 = vec_ld(16, workspace);
- row2 = vec_ld(32, workspace);
- row3 = vec_ld(48, workspace);
- row4 = vec_ld(64, workspace);
- row5 = vec_ld(80, workspace);
- row6 = vec_ld(96, workspace);
- row7 = vec_ld(112, workspace);
-
- /* Branch-less absolute value */
- row0s = vec_sra(row0, pw_word_bit_m1);
- row1s = vec_sra(row1, pw_word_bit_m1);
- row2s = vec_sra(row2, pw_word_bit_m1);
- row3s = vec_sra(row3, pw_word_bit_m1);
- row4s = vec_sra(row4, pw_word_bit_m1);
- row5s = vec_sra(row5, pw_word_bit_m1);
- row6s = vec_sra(row6, pw_word_bit_m1);
- row7s = vec_sra(row7, pw_word_bit_m1);
- row0 = vec_xor(row0, row0s);
- row1 = vec_xor(row1, row1s);
- row2 = vec_xor(row2, row2s);
- row3 = vec_xor(row3, row3s);
- row4 = vec_xor(row4, row4s);
- row5 = vec_xor(row5, row5s);
- row6 = vec_xor(row6, row6s);
- row7 = vec_xor(row7, row7s);
- row0 = vec_sub(row0, row0s);
- row1 = vec_sub(row1, row1s);
- row2 = vec_sub(row2, row2s);
- row3 = vec_sub(row3, row3s);
- row4 = vec_sub(row4, row4s);
- row5 = vec_sub(row5, row5s);
- row6 = vec_sub(row6, row6s);
- row7 = vec_sub(row7, row7s);
-
- corr0 = vec_ld(DCTSIZE2 * 2, divisors);
- corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
- corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
- corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
- corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
- corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
- corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
- corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
-
- row0 = vec_add(row0, corr0);
- row1 = vec_add(row1, corr1);
- row2 = vec_add(row2, corr2);
- row3 = vec_add(row3, corr3);
- row4 = vec_add(row4, corr4);
- row5 = vec_add(row5, corr5);
- row6 = vec_add(row6, corr6);
- row7 = vec_add(row7, corr7);
-
- recip0 = vec_ld(0, divisors);
- recip1 = vec_ld(16, divisors);
- recip2 = vec_ld(32, divisors);
- recip3 = vec_ld(48, divisors);
- recip4 = vec_ld(64, divisors);
- recip5 = vec_ld(80, divisors);
- recip6 = vec_ld(96, divisors);
- recip7 = vec_ld(112, divisors);
-
- MULTIPLY(row0, recip0, row0);
- MULTIPLY(row1, recip1, row1);
- MULTIPLY(row2, recip2, row2);
- MULTIPLY(row3, recip3, row3);
- MULTIPLY(row4, recip4, row4);
- MULTIPLY(row5, recip5, row5);
- MULTIPLY(row6, recip6, row6);
- MULTIPLY(row7, recip7, row7);
-
- scale0 = vec_ld(DCTSIZE2 * 4, divisors);
- scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
- scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
- scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
- scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
- scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
- scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
- scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
-
- MULTIPLY(row0, scale0, row0);
- MULTIPLY(row1, scale1, row1);
- MULTIPLY(row2, scale2, row2);
- MULTIPLY(row3, scale3, row3);
- MULTIPLY(row4, scale4, row4);
- MULTIPLY(row5, scale5, row5);
- MULTIPLY(row6, scale6, row6);
- MULTIPLY(row7, scale7, row7);
-
- row0 = vec_xor(row0, row0s);
- row1 = vec_xor(row1, row1s);
- row2 = vec_xor(row2, row2s);
- row3 = vec_xor(row3, row3s);
- row4 = vec_xor(row4, row4s);
- row5 = vec_xor(row5, row5s);
- row6 = vec_xor(row6, row6s);
- row7 = vec_xor(row7, row7s);
- row0 = vec_sub(row0, row0s);
- row1 = vec_sub(row1, row1s);
- row2 = vec_sub(row2, row2s);
- row3 = vec_sub(row3, row3s);
- row4 = vec_sub(row4, row4s);
- row5 = vec_sub(row5, row5s);
- row6 = vec_sub(row6, row6s);
- row7 = vec_sub(row7, row7s);
-
- vec_st(row0, 0, coef_block);
- vec_st(row1, 16, coef_block);
- vec_st(row2, 32, coef_block);
- vec_st(row3, 48, coef_block);
- vec_st(row4, 64, coef_block);
- vec_st(row5, 80, coef_block);
- vec_st(row6, 96, coef_block);
- vec_st(row7, 112, coef_block);
-}
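
One scalar lane of jsimd_quantize_altivec() above, to make the branch-less pieces explicit: shifting by WORD_BIT - 1 yields an all-zeros or all-ones sign mask, (x ^ s) - s is then |x|, and MULTIPLY() keeps the high 16 bits of each unsigned product (that is what shift_pack_index selects). quantize_one() and its parameter names are illustrative; the corr/recip/scale roles follow the divisor load offsets above.

#include <stdint.h>

static int16_t quantize_one(int16_t x, uint16_t corr, uint16_t recip,
                            uint16_t scale)
{
  int16_t s = x >> 15;                          /* sign mask: 0 or -1 */
  uint16_t m = (uint16_t)((x ^ s) - s);         /* branch-less |x| */
  m = (uint16_t)(m + corr);                     /* rounding correction */
  m = (uint16_t)(((uint32_t)m * recip) >> 16);  /* high half, as MULTIPLY() */
  m = (uint16_t)(((uint32_t)m * scale) >> 16);
  return (int16_t)((m ^ s) - s);                /* restore the sign */
}
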
diff --git a/simd/powerpc/jsimd.c b/simd/powerpc/jsimd.c
deleted file mode 100644
index d0d3981..0000000
--- a/simd/powerpc/jsimd.c
+++ /dev/null
@@ -1,872 +0,0 @@
-/*
- * jsimd_powerpc.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * PowerPC architecture.
- */
-
-#ifdef __amigaos4__
-/* This must be defined first as it re-defines GLOBAL otherwise */
-#include <proto/exec.h>
-#endif
-
-#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
-#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#if defined(__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-static unsigned int simd_support = ~0;
-
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature(char *buffer, char *feature)
-{
- char *p;
-
- if (*feature == 0)
- return 0;
- if (strncmp(buffer, "cpu", 3) != 0)
- return 0;
- buffer += 3;
- while (isspace(*buffer))
- buffer++;
-
- /* Check if 'feature' is present in the buffer as a separate word */
- while ((p = strstr(buffer, feature))) {
- if (p > buffer && !isspace(*(p - 1))) {
- buffer++;
- continue;
- }
- p += strlen(feature);
- if (*p != 0 && !isspace(*p)) {
- buffer++;
- continue;
- }
- return 1;
- }
- return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo(int bufsize)
-{
- char *buffer = (char *)malloc(bufsize);
- FILE *fd;
-
- simd_support = 0;
-
- if (!buffer)
- return 0;
-
- fd = fopen("/proc/cpuinfo", "r");
- if (fd) {
- while (fgets(buffer, bufsize, fd)) {
- if (!strchr(buffer, '\n') && !feof(fd)) {
- /* "impossible" happened - insufficient size of the buffer! */
- fclose(fd);
- free(buffer);
- return 0;
- }
- if (check_feature(buffer, "altivec"))
- simd_support |= JSIMD_ALTIVEC;
- }
- fclose(fd);
- }
- free(buffer);
- return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy in a multi-threaded environment.
- */
-LOCAL(void)
-init_simd(void)
-{
-#ifndef NO_GETENV
- char *env = NULL;
-#endif
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
- int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#elif defined(__amigaos4__)
- uint32 altivec = 0;
-#elif defined(__OpenBSD__)
- int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
- int altivec;
- size_t len = sizeof(altivec);
-#endif
-
- if (simd_support != ~0U)
- return;
-
- simd_support = 0;
-
-#if defined(__ALTIVEC__) || defined(__APPLE__)
- simd_support |= JSIMD_ALTIVEC;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- while (!parse_proc_cpuinfo(bufsize)) {
- bufsize *= 2;
- if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
- break;
- }
-#elif defined(__amigaos4__)
- IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
- if (altivec == VECTORTYPE_ALTIVEC)
- simd_support |= JSIMD_ALTIVEC;
-#elif defined(__OpenBSD__)
- if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
- simd_support |= JSIMD_ALTIVEC;
-#endif
-
-#ifndef NO_GETENV
- /* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEALTIVEC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = JSIMD_ALTIVEC;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
- simd_support = 0;
-#endif
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct = jsimd_extrgb_ycc_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct = jsimd_extrgbx_ycc_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct = jsimd_extbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct = jsimd_extbgrx_ycc_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct = jsimd_extxbgr_ycc_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct = jsimd_extxrgb_ycc_convert_altivec;
- break;
- default:
- altivecfct = jsimd_rgb_ycc_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf, JDIMENSION output_row,
- int num_rows)
-{
- void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
- switch (cinfo->in_color_space) {
- case JCS_EXT_RGB:
- altivecfct = jsimd_extrgb_gray_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct = jsimd_extrgbx_gray_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct = jsimd_extbgr_gray_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct = jsimd_extbgrx_gray_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct = jsimd_extxbgr_gray_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct = jsimd_extxrgb_gray_convert_altivec;
- break;
- default:
- altivecfct = jsimd_rgb_gray_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
- void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct = jsimd_ycc_extrgb_convert_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct = jsimd_ycc_extrgbx_convert_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct = jsimd_ycc_extbgr_convert_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct = jsimd_ycc_extbgrx_convert_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct = jsimd_ycc_extxbgr_convert_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct = jsimd_ycc_extxrgb_convert_altivec;
- break;
- default:
- altivecfct = jsimd_ycc_rgb_convert_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION input_row, JSAMPARRAY output_buf,
- int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
- jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
- compptr->v_samp_factor,
- compptr->width_in_blocks, input_data,
- output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
- input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
- jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
- compptr->downsampled_width, input_data,
- output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
- void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct = jsimd_h2v2_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
- JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
-{
- void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
- switch (cinfo->out_color_space) {
- case JCS_EXT_RGB:
- altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
- break;
- case JCS_EXT_RGBX:
- case JCS_EXT_RGBA:
- altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
- break;
- case JCS_EXT_BGR:
- altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_BGRX:
- case JCS_EXT_BGRA:
- altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
- break;
- case JCS_EXT_XBGR:
- case JCS_EXT_ABGR:
- altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
- break;
- case JCS_EXT_XRGB:
- case JCS_EXT_ARGB:
- altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
- break;
- default:
- altivecfct = jsimd_h2v1_merged_upsample_altivec;
- break;
- }
-
- altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (BITS_IN_JSAMPLE != 8)
- return 0;
- if (sizeof(JDIMENSION) != 4)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
- DCTELEM *workspace)
-{
- jsimd_convsamp_altivec(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow(DCTELEM *data)
-{
- jsimd_fdct_islow_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast(DCTELEM *data)
-{
- jsimd_fdct_ifast_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float(FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
- if (sizeof(DCTELEM) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
-{
- jsimd_quantize_altivec(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
- FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast(void)
-{
- init_simd();
-
- /* The code is optimised for these values only */
- if (DCTSIZE != 8)
- return 0;
- if (sizeof(JCOEF) != 2)
- return 0;
-
- if (simd_support & JSIMD_ALTIVEC)
- return 1;
-
- return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
- jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
- output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JCOEFPTR coef_block, JSAMPARRAY output_buf,
- JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block(void)
-{
- return 0;
-}
-
-GLOBAL(JOCTET *)
-jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
- int last_dc_val, c_derived_tbl *dctbl,
- c_derived_tbl *actbl)
-{
- return NULL;
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_first_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(void)
-jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
-{
-}
-
-GLOBAL(int)
-jsimd_can_encode_mcu_AC_refine_prepare(void)
-{
- return 0;
-}
-
-GLOBAL(int)
-jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
- const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
-{
- return 0;
-}
diff --git a/simd/powerpc/jsimd_altivec.h b/simd/powerpc/jsimd_altivec.h
deleted file mode 100644
index e8bdb06..0000000
--- a/simd/powerpc/jsimd_altivec.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
-#include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
-#include <altivec.h>
-
-
-/* Common code */
-
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
-
-#define TRANSPOSE(row, col) { \
- __vector short row04l, row04h, row15l, row15h, \
- row26l, row26h, row37l, row37h; \
- __vector short col01e, col01o, col23e, col23o, \
- col45e, col45o, col67e, col67o; \
- \
- /* transpose coefficients (phase 1) */ \
- row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
- row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
- row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
- row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
- row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
- row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
- row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
- row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
- \
- /* transpose coefficients (phase 2) */ \
- col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
- col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
- col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
- col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
- col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
- col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
- col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
- col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
- \
- /* transpose coefficients (phase 3) */ \
- col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
- col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
- col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
- col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
- col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
- col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
- col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
- col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
-}
-
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-
-/* Macros to abstract big/little endian bit twiddling */
-
-#if __BIG_ENDIAN__
-
-#define VEC_LD(a, b) vec_ld(a, b)
-#define VEC_ST(a, b, c) vec_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
-#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
-
-#else
-
-#define VEC_LD(a, b) vec_vsx_ld(a, b)
-#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
-#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
-
-#endif
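
The VEC_UNPACKHU()/VEC_UNPACKLU() macros above hide an endianness detail: merging with a zero-byte vector zero-extends unsigned samples to 16-bit words, but which operand supplies the zeros depends on where the most significant byte of each halfword sits. The net effect, as a scalar sketch (the function names are illustrative):

#include <stdint.h>

/* VEC_UNPACKHU: zero-extend the first 8 bytes of a 16-byte vector */
static void unpack_high_u8(const uint8_t in[16], int16_t out[8])
{
  for (int i = 0; i < 8; i++)
    out[i] = (int16_t)in[i];
}

/* VEC_UNPACKLU: likewise for the last 8 bytes */
static void unpack_low_u8(const uint8_t in[16], int16_t out[8])
{
  for (int i = 0; i < 8; i++)
    out[i] = (int16_t)in[i + 8];
}
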
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 5fa3848..10d2834 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -96,12 +94,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ movzx rax, byte [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
+ movzx rdx, word [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
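
The .column_ld1/.column_ld2 logic visible in this hunk handles a pixel count that is not a multiple of the register width: the low bits of the remaining byte count say whether a stray byte and/or word must be picked up, and those pieces are packed into one scalar before the wider loads take over. A C model of the two steps shown (load_tail and its names are illustrative):

#include <stddef.h>
#include <stdint.h>

static uint64_t load_tail(const uint8_t *src, size_t *count)
{
  uint64_t acc = 0;
  if (*count & 1) {                /* test cl, SIZEOF_BYTE */
    *count -= 1;
    acc = src[*count];             /* movzx rax, byte [rsi+rcx] */
  }
  if (*count & 2) {                /* test cl, SIZEOF_WORD */
    *count -= 2;                   /* shl rax, WORD_BIT; or rax, rdx */
    acc = (acc << 16) |
          ((uint64_t)src[*count] | ((uint64_t)src[*count + 1] << 8));
  }
  return acc;
}
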
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index b1486c0..2c914d3 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -95,12 +93,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ movzx rax, byte [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
+ movzx rdx, word [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
diff --git a/simd/x86_64/jccolor-avx2.asm b/simd/x86_64/jccolor-avx2.asm
index f9f4be0..16b7829 100644
--- a/simd/x86_64/jccolor-avx2.asm
+++ b/simd/x86_64/jccolor-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jccolor-sse2.asm b/simd/x86_64/jccolor-sse2.asm
index 3e46601..e2955c2 100644
--- a/simd/x86_64/jccolor-sse2.asm
+++ b/simd/x86_64/jccolor-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jcgray-avx2.asm b/simd/x86_64/jcgray-avx2.asm
index 0ec2410..591255b 100644
--- a/simd/x86_64/jcgray-avx2.asm
+++ b/simd/x86_64/jcgray-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jcgray-sse2.asm b/simd/x86_64/jcgray-sse2.asm
index edf9222..e389904 100644
--- a/simd/x86_64/jcgray-sse2.asm
+++ b/simd/x86_64/jcgray-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 79e2aa0..175b60d 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -88,12 +86,12 @@ EXTN(jsimd_rgb_gray_convert_avx2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ movzx rax, byte [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
+ movzx rdx, word [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 9c3ae5e..873be80 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -12,8 +12,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -87,12 +85,12 @@ EXTN(jsimd_rgb_gray_convert_sse2):
test cl, SIZEOF_BYTE
jz short .column_ld2
sub rcx, byte SIZEOF_BYTE
- movzx rax, BYTE [rsi+rcx]
+ movzx rax, byte [rsi+rcx]
.column_ld2:
test cl, SIZEOF_WORD
jz short .column_ld4
sub rcx, byte SIZEOF_WORD
- movzx rdx, WORD [rsi+rcx]
+ movzx rdx, word [rsi+rcx]
shl rax, WORD_BIT
or rax, rdx
.column_ld4:
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 1b091ad..7deab58 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -17,8 +17,6 @@
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based directly on jchuff.c; see jchuff.c for more
; details.
-;
-; [TAB8]
%include "jsimdext.inc"
@@ -27,11 +25,10 @@
alignz 32
GLOBAL_DATA(jconst_huff_encode_one_block)
+ EXTERN EXTN(jpeg_nbits_table)
EXTN(jconst_huff_encode_one_block):
-%include "jpeg_nbits_table.inc"
-
alignz 32
; --------------------------------------------------------------------------
@@ -200,7 +197,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
    mov buffer, r11 ; r11 is now scratch
mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits;
+ mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits;
push r10 ; r10 is now scratch
; Encode the DC coefficient difference per section F.1.2.1
@@ -222,7 +219,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
add ebx, esi ; temp2 += temp3;
; Find the number of bits needed for the magnitude of the coefficient
- lea r11, [rel jpeg_nbits_table]
+ lea r11, [rel EXTN(jpeg_nbits_table)]
movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
; Emit the Huffman-coded symbol for the number of bits
mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
@@ -289,7 +286,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
lea rsi, [rsi+r12*2] ; k += r;
shr r11, cl ; index >>= r;
movzx rdi, word [rsi] ; temp = t1[k];
- lea rbx, [rel jpeg_nbits_table]
+ lea rbx, [rel EXTN(jpeg_nbits_table)]
movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
.BRLOOP:
cmp r12, 16 ; while (r > 15) {
@@ -333,7 +330,7 @@ EXTN(jsimd_huff_encode_one_block_sse2):
pop r10
; Save put_buffer & put_bits
mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits;
+ mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits;
pop rbx
uncollect_args 6
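
This hunk swaps the inline %include of the nbits table for an EXTERN reference to EXTN(jpeg_nbits_table); the lookup still computes the bit length of a coefficient magnitude, per the JPEG_NBITS comments. A scalar sketch of what one table entry holds (nbits_of is an illustrative name; the real code uses the precomputed table for speed):

static int nbits_of(unsigned int magnitude)
{
  int nbits = 0;                   /* 0 for 0, 1 for 1, 2 for 2..3, ... */
  while (magnitude) {
    nbits++;
    magnitude >>= 1;
  }
  return nbits;
}
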
diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm
index a9446b7..8ed4472 100644
--- a/simd/x86_64/jcphuff-sse2.asm
+++ b/simd/x86_64/jcphuff-sse2.asm
@@ -16,8 +16,6 @@
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
index 9d5a861..d9922bb 100644
--- a/simd/x86_64/jcsample-avx2.asm
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
index 1b31536..0f107e9 100644
--- a/simd/x86_64/jcsample-sse2.asm
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index e2b96c7..677b8ed 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -334,7 +332,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
vmovd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [rdi], ax
+ mov word [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
@@ -343,7 +341,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
; space.
test rcx, rcx
jz short .nextrow
- mov BYTE [rdi], al
+ mov byte [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index a94954b..071aa62 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -306,7 +304,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [rdi], ax
+ mov word [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
@@ -315,7 +313,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
; space.
test rcx, rcx
jz short .nextrow
- mov BYTE [rdi], al
+ mov byte [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
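
The .column_st1 tail in the jdcolext hunks mirrors the load cascade on the store side: once fewer than a dword of output bytes remains, the low word of the accumulated pixel data is stored first, then a final single byte. A minimal C sketch under the same assumptions as the load helper above (names are illustrative):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Spill the low bytes of acc when fewer than 4 output bytes remain:
     * a word first, then one byte, as the asm does with ax and al. */
    static void store_tail(uint8_t *dst, size_t count, uint32_t acc)
    {
      if (count >= 2) {            /* cmp rcx, byte SIZEOF_WORD; jb .column_st1 */
        uint16_t w = (uint16_t)acc;
        memcpy(dst, &w, 2);        /* mov word [rdi], ax */
        dst += 2;
        count -= 2;
        acc >>= 16;                /* shr rax, 16 */
      }
      if (count) {                 /* test rcx, rcx; jz .nextrow */
        *dst = (uint8_t)acc;       /* mov byte [rdi], al */
      }
    }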
diff --git a/simd/x86_64/jdcolor-avx2.asm b/simd/x86_64/jdcolor-avx2.asm
index abad176..43de9db 100644
--- a/simd/x86_64/jdcolor-avx2.asm
+++ b/simd/x86_64/jdcolor-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdcolor-sse2.asm b/simd/x86_64/jdcolor-sse2.asm
index e7079f6..b3f1fec 100644
--- a/simd/x86_64/jdcolor-sse2.asm
+++ b/simd/x86_64/jdcolor-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdmerge-avx2.asm b/simd/x86_64/jdmerge-avx2.asm
index ca3f063..9515a17 100644
--- a/simd/x86_64/jdmerge-avx2.asm
+++ b/simd/x86_64/jdmerge-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdmerge-sse2.asm b/simd/x86_64/jdmerge-sse2.asm
index f3e09fa..aedccc2 100644
--- a/simd/x86_64/jdmerge-sse2.asm
+++ b/simd/x86_64/jdmerge-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index 04e8a94..bb733c5 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -339,7 +337,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
vmovd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [rdi], ax
+ mov word [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
@@ -348,7 +346,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
; space.
test rcx, rcx
jz short .endcolumn
- mov BYTE [rdi], al
+ mov byte [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index 1cc3345..b176a4c 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jcolsamp.inc"
@@ -310,7 +308,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
movd eax, xmmA
cmp rcx, byte SIZEOF_WORD
jb short .column_st1
- mov WORD [rdi], ax
+ mov word [rdi], ax
add rdi, byte SIZEOF_WORD
sub rcx, byte SIZEOF_WORD
shr rax, 16
@@ -319,7 +317,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
; space.
test rcx, rcx
jz short .endcolumn
- mov BYTE [rdi], al
+ mov byte [rdi], al
%else ; RGB_PIXELSIZE == 4 ; -----------
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
index 10fa5c4..fc274a9 100644
--- a/simd/x86_64/jdsample-avx2.asm
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
index d8ccda9..20e0767 100644
--- a/simd/x86_64/jdsample-sse2.asm
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
diff --git a/simd/x86_64/jfdctflt-sse.asm b/simd/x86_64/jfdctflt-sse.asm
index 26f9fb6..ef27966 100644
--- a/simd/x86_64/jfdctflt-sse.asm
+++ b/simd/x86_64/jfdctflt-sse.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jfdctfst-sse2.asm b/simd/x86_64/jfdctfst-sse2.asm
index aaf8b9e..2e1bfe6 100644
--- a/simd/x86_64/jfdctfst-sse2.asm
+++ b/simd/x86_64/jfdctfst-sse2.asm
@@ -18,8 +18,6 @@
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
index 448f47d..6ad4cf0 100644
--- a/simd/x86_64/jfdctint-avx2.asm
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -18,8 +18,6 @@
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
index ef16a52..5d0de3c 100644
--- a/simd/x86_64/jfdctint-sse2.asm
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -18,8 +18,6 @@
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index b676ef3..ab95e1a 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -17,8 +17,6 @@
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -95,8 +93,8 @@ EXTN(jsimd_idct_float_sse2):
mov rcx, DCTSIZE/4 ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
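
The NO_ZERO_COLUMN_TEST_* hunks here and in the IDCT files below guard a common shortcut: before running a full column pass, the code ORs together AC coefficients of a column group (reading dword pairs of 16-bit JCOEFs, hence the two dword loads) and, when everything is zero, the column's IDCT collapses to replicating the scaled DC term. A rough C sketch of the test (the loop form is an illustrative generalization of the pairwise dword checks in the asm):

    #include <stdint.h>

    typedef int16_t JCOEF;
    #define DCTSIZE 8

    /* Returns nonzero when every AC coefficient in column col is zero,
     * i.e. the column IDCT reduces to a DC-only result. */
    static int column_is_dc_only(const JCOEF *block, int col)
    {
      int row;
      int acc = 0;

      for (row = 1; row < DCTSIZE; row++)
        acc |= block[row * DCTSIZE + col];
      return acc == 0;
    }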
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index c6c42f9..a66a681 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -18,8 +18,6 @@
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -111,8 +109,8 @@ EXTN(jsimd_idct_ifast_sse2):
mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index b60b44f..50270f4 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -18,8 +18,6 @@
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -292,8 +290,8 @@ EXTN(jsimd_idct_islow_avx2):
; ---- Pass 1: process columns.
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
- mov eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 83fc344..034530c 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -18,8 +18,6 @@
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -124,8 +122,8 @@ EXTN(jsimd_idct_islow_sse2):
mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz near .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index af64fdc..7fbfcc5 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -18,8 +18,6 @@
; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
; The following code is based directly on the IJG's original jidctred.c;
; see the jidctred.c for more details.
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
@@ -132,8 +130,8 @@ EXTN(jsimd_idct_4x4_sse2):
mov rsi, r11 ; inptr
%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
- mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
- or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
jnz short .columnDCT
movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
@@ -562,8 +560,8 @@ EXTN(jsimd_idct_2x2_sse2):
mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx
- mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+ mov word [rdx+rax*SIZEOF_JSAMPLE], bx
+ mov word [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx
uncollect_args 4
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
index 4600eec..83596a9 100644
--- a/simd/x86_64/jquantf-sse2.asm
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
index b7243e4..5f04d22 100644
--- a/simd/x86_64/jquanti-avx2.asm
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
index 7ff7275..bb6fa69 100644
--- a/simd/x86_64/jquanti-sse2.asm
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -13,8 +13,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index 1e5698b..dc639fc 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -472,6 +472,12 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0;
}
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+ return 0;
+}
+
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -500,6 +506,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr);
}
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
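
The new jsimd_can_h1v2_fancy_upsample()/jsimd_h1v2_fancy_upsample() pair keeps the x86_64 dispatcher in step with the upstream API: the probe returns 0, so the empty SIMD stub is never selected and the library falls back to its portable C upsampler. A self-contained sketch of that can-probe dispatch pattern (everything except the two jsimd_* names is an assumption for illustration, not the library's actual dispatcher):

    #include <stdio.h>

    /* Matches the stub in the diff: no SIMD h1v2 fancy upsampler on x86_64. */
    static int jsimd_can_h1v2_fancy_upsample(void) { return 0; }

    typedef void (*upsample_fn)(void);

    static void c_h1v2_fancy_upsample(void)    { puts("portable C path"); }
    static void simd_h1v2_fancy_upsample(void) { puts("SIMD path"); }

    /* The dispatcher keeps the C method unless the probe says otherwise,
     * which is why an empty SIMD stub body is safe. */
    static upsample_fn select_h1v2_upsampler(void)
    {
      return jsimd_can_h1v2_fancy_upsample() ? simd_h1v2_fancy_upsample
                                             : c_h1v2_fancy_upsample;
    }

    int main(void)
    {
      select_h1v2_upsampler()();
      return 0;
    }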
diff --git a/simd/x86_64/jsimdcpu.asm b/simd/x86_64/jsimdcpu.asm
index a905282..705f813 100644
--- a/simd/x86_64/jsimdcpu.asm
+++ b/simd/x86_64/jsimdcpu.asm
@@ -14,8 +14,6 @@
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
%include "jsimdext.inc"