| author | Elliott Hughes <enh@google.com> | 2020-12-02 18:09:57 -0800 |
|---|---|---|
| committer | Elliott Hughes <enh@google.com> | 2020-12-02 18:24:38 -0800 |
| commit | 98e581f8227b9846b7adc92c0c63f5ed2384ff4b (patch) | |
| tree | 26effec72dbace7671158aeddc9fde1e3e9cd1a9 /simd | |
| parent | ff78ee5a324a7d37e0bfffd6152ea37056f29931 (diff) | |
| parent | d5148db386ceb4a608058320071cbed890bd6ad2 (diff) | |
Switch to chromium upstream.
Bug: https://issuetracker.google.com/135180511
Test: treehugger
Change-Id: I0c78ec9b07debdb501a96df0d6cb2f9a24b5fc84
Diffstat (limited to 'simd')
156 files changed, 7010 insertions, 25751 deletions
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt index 3472c0d..5c8009a 100755 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt @@ -38,6 +38,14 @@ elseif(CPU_TYPE STREQUAL "i386") endif() endif() +if(NOT REQUIRE_SIMD) + include(CheckLanguage) + check_language(ASM_NASM) + if(NOT CMAKE_ASM_NASM_COMPILER) + simd_fail("SIMD extensions disabled: could not find NASM compiler") + return() + endif() +endif() enable_language(ASM_NASM) message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}") diff --git a/simd/arm/arm/jccolext-neon.c b/simd/arm/arm/jccolext-neon.c new file mode 100644 index 0000000..4f22e1f --- /dev/null +++ b/simd/arm/arm/jccolext-neon.c @@ -0,0 +1,145 @@ +/* + * jccolext-neon.c - colorspace conversion (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jccolor-neon.c */ + +/* + * RGB -> YCbCr conversion is defined by the following equations: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * 0.16874695 = 11059 * 2^-16 + * 0.33125305 = 21709 * 2^-16 + * 0.50000000 = 32768 * 2^-16 + * 0.41868592 = 27439 * 2^-16 + * 0.08131409 = 5329 * 2^-16 + * These constants are defined in jccolor-neon.c + * + * To ensure rounding gives correct values, we add 0.5 to Cb and Cr. + */ + +void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, + JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, + int num_rows) +{ + /* Pointer to RGB(X/A) input data. */ + JSAMPROW inptr; + /* Pointers to Y, Cb and Cr output data. */ + JSAMPROW outptr0, outptr1, outptr2; + + /* Setup conversion constants. */ +#if defined(__clang__) + const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts); +#else + /* GCC does not currently support the intrinsic vld1_<type>_x2(). 
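+ * As a fallback, the two uint16x4_t halves of the constant table are
+ * loaded with separate vld1_u16() calls and assembled into a
+ * uint16x4x2_t by hand; the resulting lane layout matches the single
+ * vld1_u16_x2() load used under Clang.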
*/ + const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts); + const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4); + const uint16x4x2_t consts = { consts1, consts2 }; +#endif + const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767); + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining > 0; cols_remaining -= 8) { + + /* To prevent buffer overread by the vector load instructions, the */ + /* last (image_width % 8) columns of data are first memcopied to a */ + /* temporary buffer large enough to accommodate the vector load. */ + if (cols_remaining < 8) { + ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + } + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t input_pixels = vld4_u8(inptr); +#else + uint8x8x3_t input_pixels = vld3_u8(inptr); +#endif + uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]); + uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]); + uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0); + y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1); + y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2); + uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0); + y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1); + y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_low = scaled_128_5; + cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3); + cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0); + cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1); + uint32x4_t cb_high = scaled_128_5; + cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3); + cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0); + cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_low = scaled_128_5; + cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1); + cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2); + cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3); + uint32x4_t cr_high = scaled_128_5; + cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1); + cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2); + cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16), + vrshrn_n_u32(y_high, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16), + vshrn_n_u32(cb_high, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16), + vshrn_n_u32(cr_high, 16)); + /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */ + /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. 
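+ * (vst1_u8 always stores a full eight bytes, so the final partial
+ * iteration may write up to seven columns past image_width into that
+ * permitted padding.)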
*/ + vst1_u8(outptr0, vmovn_u16(y_u16)); + vst1_u8(outptr1, vmovn_u16(cb_u16)); + vst1_u8(outptr2, vmovn_u16(cr_u16)); + + /* Increment pointers. */ + inptr += (8 * RGB_PIXELSIZE); + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + } + } +} diff --git a/simd/arm/jsimd.c b/simd/arm/arm/jsimd.c index 45f9b04..c0d5d90 100644 --- a/simd/arm/jsimd.c +++ b/simd/arm/arm/jsimd.c @@ -17,12 +17,12 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" #include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" #include <stdio.h> #include <string.h> @@ -164,6 +164,19 @@ jsimd_can_rgb_ycc(void) GLOBAL(int) jsimd_can_rgb_gray(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -246,6 +259,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) { + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_gray_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_gray_convert_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_gray_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_gray_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_gray_convert_neon; + break; + default: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -298,12 +342,38 @@ jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, GLOBAL(int) jsimd_can_h2v2_downsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_downsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -311,23 +381,51 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); } GLOBAL(int) jsimd_can_h2v2_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + 
return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -335,17 +433,32 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) jsimd_can_h2v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -366,10 +479,30 @@ jsimd_can_h2v1_fancy_upsample(void) return 0; } +GLOBAL(int) +jsimd_can_h1v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(void) @@ -381,15 +514,46 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, output_data_ptr); } +GLOBAL(void) +jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + GLOBAL(int) jsimd_can_h2v2_merged_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_merged_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -397,12 +561,74 @@ GLOBAL(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) { + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v2_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v2_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon; + break; + default: + neonfct = 
jsimd_h2v2_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) { + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v1_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon; + break; + default: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(int) @@ -448,6 +674,17 @@ jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, GLOBAL(int) jsimd_can_fdct_islow(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -477,6 +714,7 @@ jsimd_can_fdct_float(void) GLOBAL(void) jsimd_fdct_islow(DCTELEM *data) { + jsimd_fdct_islow_neon(data); } GLOBAL(void) diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S new file mode 100644 index 0000000..2c45324 --- /dev/null +++ b/simd/arm/arm/jsimd_neon.S @@ -0,0 +1,499 @@ +/* + * ARMv7 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). + * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. + * Copyright (C) 2014, Linaro Limited. All Rights Reserved. + * Copyright (C) 2015, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +.text +.fpu neon +.arch armv7a +.object_arch armv4 +.arm +.syntax unified + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .private_extern _\fname + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET*) + * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + +.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + sub \PUT_BITS, \PUT_BITS, #0x8 + lsr \TMP, \PUT_BUFFER, \PUT_BITS + uxtb \TMP, \TMP + strb \TMP, [\BUFFER, #1]! + cmp \TMP, #0xff + /*it eq*/ + strbeq \ZERO, [\BUFFER, #1]! +.endm + +.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE + /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ + add \PUT_BITS, \SIZE + /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ + orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE +.endm + +.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP + cmp \PUT_BITS, #0x10 + blt 15f + eor \ZERO, \ZERO, \ZERO + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP + emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP +15: +.endm + +.balign 16 +jsimd_huff_encode_one_block_neon_consts: + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + +asm_function jsimd_huff_encode_one_block_neon + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + add r7, sp, #0x1c + sub r4, sp, #0x40 + bfc r4, #0, #5 + mov sp, r4 /* align sp on 32 bytes */ + vst1.64 {d8, d9, d10, d11}, [r4, :128]! 
+ vst1.64 {d12, d13, d14, d15}, [r4, :128] + sub sp, #0x140 /* reserve 320 bytes */ + str r0, [sp, #0x18] /* working state > sp + Ox18 */ + add r4, sp, #0x20 /* r4 = t1 */ + ldr lr, [r7, #0x8] /* lr = dctbl */ + sub r10, r1, #0x1 /* r10=buffer-- */ + ldrsh r1, [r2] + mov r9, #0x10 + mov r8, #0x1 + adr r5, jsimd_huff_encode_one_block_neon_consts + /* prepare data */ + vld1.8 {d26}, [r5, :64] + veor q8, q8, q8 + veor q9, q9, q9 + vdup.16 q14, r9 + vdup.16 q15, r8 + veor q10, q10, q10 + veor q11, q11, q11 + sub r1, r1, r3 + add r9, r2, #0x22 + add r8, r2, #0x18 + add r3, r2, #0x36 + vmov.16 d0[0], r1 + vld1.16 {d2[0]}, [r9, :16] + vld1.16 {d4[0]}, [r8, :16] + vld1.16 {d6[0]}, [r3, :16] + add r1, r2, #0x2 + add r9, r2, #0x30 + add r8, r2, #0x26 + add r3, r2, #0x28 + vld1.16 {d0[1]}, [r1, :16] + vld1.16 {d2[1]}, [r9, :16] + vld1.16 {d4[1]}, [r8, :16] + vld1.16 {d6[1]}, [r3, :16] + add r1, r2, #0x10 + add r9, r2, #0x40 + add r8, r2, #0x34 + add r3, r2, #0x1a + vld1.16 {d0[2]}, [r1, :16] + vld1.16 {d2[2]}, [r9, :16] + vld1.16 {d4[2]}, [r8, :16] + vld1.16 {d6[2]}, [r3, :16] + add r1, r2, #0x20 + add r9, r2, #0x32 + add r8, r2, #0x42 + add r3, r2, #0xc + vld1.16 {d0[3]}, [r1, :16] + vld1.16 {d2[3]}, [r9, :16] + vld1.16 {d4[3]}, [r8, :16] + vld1.16 {d6[3]}, [r3, :16] + add r1, r2, #0x12 + add r9, r2, #0x24 + add r8, r2, #0x50 + add r3, r2, #0xe + vld1.16 {d1[0]}, [r1, :16] + vld1.16 {d3[0]}, [r9, :16] + vld1.16 {d5[0]}, [r8, :16] + vld1.16 {d7[0]}, [r3, :16] + add r1, r2, #0x4 + add r9, r2, #0x16 + add r8, r2, #0x60 + add r3, r2, #0x1c + vld1.16 {d1[1]}, [r1, :16] + vld1.16 {d3[1]}, [r9, :16] + vld1.16 {d5[1]}, [r8, :16] + vld1.16 {d7[1]}, [r3, :16] + add r1, r2, #0x6 + add r9, r2, #0x8 + add r8, r2, #0x52 + add r3, r2, #0x2a + vld1.16 {d1[2]}, [r1, :16] + vld1.16 {d3[2]}, [r9, :16] + vld1.16 {d5[2]}, [r8, :16] + vld1.16 {d7[2]}, [r3, :16] + add r1, r2, #0x14 + add r9, r2, #0xa + add r8, r2, #0x44 + add r3, r2, #0x38 + vld1.16 {d1[3]}, [r1, :16] + vld1.16 {d3[3]}, [r9, :16] + vld1.16 {d5[3]}, [r8, :16] + vld1.16 {d7[3]}, [r3, :16] + vcgt.s16 q8, q8, q0 + vcgt.s16 q9, q9, q1 + vcgt.s16 q10, q10, q2 + vcgt.s16 q11, q11, q3 + vabs.s16 q0, q0 + vabs.s16 q1, q1 + vabs.s16 q2, q2 + vabs.s16 q3, q3 + veor q8, q8, q0 + veor q9, q9, q1 + veor q10, q10, q2 + veor q11, q11, q3 + add r9, r4, #0x20 + add r8, r4, #0x80 + add r3, r4, #0xa0 + vclz.i16 q0, q0 + vclz.i16 q1, q1 + vclz.i16 q2, q2 + vclz.i16 q3, q3 + vsub.i16 q0, q14, q0 + vsub.i16 q1, q14, q1 + vsub.i16 q2, q14, q2 + vsub.i16 q3, q14, q3 + vst1.16 {d0, d1, d2, d3}, [r4, :256] + vst1.16 {d4, d5, d6, d7}, [r9, :256] + vshl.s16 q0, q15, q0 + vshl.s16 q1, q15, q1 + vshl.s16 q2, q15, q2 + vshl.s16 q3, q15, q3 + vsub.i16 q0, q0, q15 + vsub.i16 q1, q1, q15 + vsub.i16 q2, q2, q15 + vsub.i16 q3, q3, q15 + vand q8, q8, q0 + vand q9, q9, q1 + vand q10, q10, q2 + vand q11, q11, q3 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + add r1, r2, #0x46 + add r9, r2, #0x3a + add r8, r2, #0x74 + add r3, r2, #0x6a + vld1.16 {d8[0]}, [r1, :16] + vld1.16 {d10[0]}, [r9, :16] + vld1.16 {d12[0]}, [r8, :16] + vld1.16 {d14[0]}, [r3, :16] + veor q8, q8, q8 + veor q9, q9, q9 + veor q10, q10, q10 + veor q11, q11, q11 + add r1, r2, #0x54 + add r9, r2, #0x2c + add r8, r2, #0x76 + add r3, r2, #0x78 + vld1.16 {d8[1]}, [r1, :16] + vld1.16 {d10[1]}, [r9, :16] + vld1.16 {d12[1]}, [r8, :16] + vld1.16 {d14[1]}, [r3, :16] + add r1, r2, #0x62 + add r9, r2, #0x1e + add r8, r2, #0x68 + add r3, r2, #0x7a + vld1.16 {d8[2]}, [r1, :16] + vld1.16 {d10[2]}, [r9, 
:16] + vld1.16 {d12[2]}, [r8, :16] + vld1.16 {d14[2]}, [r3, :16] + add r1, r2, #0x70 + add r9, r2, #0x2e + add r8, r2, #0x5a + add r3, r2, #0x6c + vld1.16 {d8[3]}, [r1, :16] + vld1.16 {d10[3]}, [r9, :16] + vld1.16 {d12[3]}, [r8, :16] + vld1.16 {d14[3]}, [r3, :16] + add r1, r2, #0x72 + add r9, r2, #0x3c + add r8, r2, #0x4c + add r3, r2, #0x5e + vld1.16 {d9[0]}, [r1, :16] + vld1.16 {d11[0]}, [r9, :16] + vld1.16 {d13[0]}, [r8, :16] + vld1.16 {d15[0]}, [r3, :16] + add r1, r2, #0x64 + add r9, r2, #0x4a + add r8, r2, #0x3e + add r3, r2, #0x6e + vld1.16 {d9[1]}, [r1, :16] + vld1.16 {d11[1]}, [r9, :16] + vld1.16 {d13[1]}, [r8, :16] + vld1.16 {d15[1]}, [r3, :16] + add r1, r2, #0x56 + add r9, r2, #0x58 + add r8, r2, #0x4e + add r3, r2, #0x7c + vld1.16 {d9[2]}, [r1, :16] + vld1.16 {d11[2]}, [r9, :16] + vld1.16 {d13[2]}, [r8, :16] + vld1.16 {d15[2]}, [r3, :16] + add r1, r2, #0x48 + add r9, r2, #0x66 + add r8, r2, #0x5c + add r3, r2, #0x7e + vld1.16 {d9[3]}, [r1, :16] + vld1.16 {d11[3]}, [r9, :16] + vld1.16 {d13[3]}, [r8, :16] + vld1.16 {d15[3]}, [r3, :16] + vcgt.s16 q8, q8, q4 + vcgt.s16 q9, q9, q5 + vcgt.s16 q10, q10, q6 + vcgt.s16 q11, q11, q7 + vabs.s16 q4, q4 + vabs.s16 q5, q5 + vabs.s16 q6, q6 + vabs.s16 q7, q7 + veor q8, q8, q4 + veor q9, q9, q5 + veor q10, q10, q6 + veor q11, q11, q7 + add r1, r4, #0x40 + add r9, r4, #0x60 + add r8, r4, #0xc0 + add r3, r4, #0xe0 + vclz.i16 q4, q4 + vclz.i16 q5, q5 + vclz.i16 q6, q6 + vclz.i16 q7, q7 + vsub.i16 q4, q14, q4 + vsub.i16 q5, q14, q5 + vsub.i16 q6, q14, q6 + vsub.i16 q7, q14, q7 + vst1.16 {d8, d9, d10, d11}, [r1, :256] + vst1.16 {d12, d13, d14, d15}, [r9, :256] + vshl.s16 q4, q15, q4 + vshl.s16 q5, q15, q5 + vshl.s16 q6, q15, q6 + vshl.s16 q7, q15, q7 + vsub.i16 q4, q4, q15 + vsub.i16 q5, q5, q15 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + vand q8, q8, q4 + vand q9, q9, q5 + vand q10, q10, q6 + vand q11, q11, q7 + vst1.16 {d16, d17, d18, d19}, [r8, :256] + vst1.16 {d20, d21, d22, d23}, [r3, :256] + ldr r12, [r7, #0xc] /* r12 = actbl */ + add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ + mov r9, r12 /* r9 = actbl */ + add r6, r4, #0x80 /* r6 = t2 */ + ldr r11, [r0, #0x8] /* r11 = put_buffer */ + ldr r4, [r0, #0xc] /* r4 = put_bits */ + ldrh r2, [r6, #-128] /* r2 = nbits */ + ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */ + ldr r0, [lr, r2, lsl #2] + ldrb r5, [r1, r2] + put_bits r11, r4, r0, r5 + checkbuf15 r10, r11, r4, r5, r0 + put_bits r11, r4, r3, r2 + checkbuf15 r10, r11, r4, r5, r0 + mov lr, r6 /* lr = t2 */ + add r5, r9, #0x400 /* r5 = actbl->ehufsi */ + ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ + veor q8, q8, q8 + vceq.i16 q0, q0, q8 + vceq.i16 q1, q1, q8 + vceq.i16 q2, q2, q8 + vceq.i16 q3, q3, q8 + vceq.i16 q4, q4, q8 + vceq.i16 q5, q5, q8 + vceq.i16 q6, q6, q8 + vceq.i16 q7, q7, q8 + vmovn.i16 d0, q0 + vmovn.i16 d2, q1 + vmovn.i16 d4, q2 + vmovn.i16 d6, q3 + vmovn.i16 d8, q4 + vmovn.i16 d10, q5 + vmovn.i16 d12, q6 + vmovn.i16 d14, q7 + vand d0, d0, d26 + vand d2, d2, d26 + vand d4, d4, d26 + vand d6, d6, d26 + vand d8, d8, d26 + vand d10, d10, d26 + vand d12, d12, d26 + vand d14, d14, d26 + vpadd.i8 d0, d0, d2 + vpadd.i8 d4, d4, d6 + vpadd.i8 d8, d8, d10 + vpadd.i8 d12, d12, d14 + vpadd.i8 d0, d0, d4 + vpadd.i8 d8, d8, d12 + vpadd.i8 d0, d0, d8 + vmov.32 r1, d0[1] + vmov.32 r8, d0[0] + mvn r1, r1 + mvn r8, r8 + lsrs r1, r1, #0x1 + rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ + rbit r1, r1 /* r1 = index1 */ + rbit r8, r8 /* r8 = index0 */ + ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ + str r1, 
[sp, #0x14] /* index1 > sp + 0x14 */ + cmp r8, #0x0 + beq 6f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r12, sp, #0x20 /* r12 = t1 */ + ldr r8, [sp, #0x14] /* r8 = index1 */ + adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ + cmp r8, #0x0 + beq 6f + clz r2, r8 + sub r12, r12, lr + lsl r8, r8, r2 + add r2, r2, r12, lsr #1 + add lr, lr, r2, lsl #1 + b 7f +1: + clz r2, r8 + add lr, lr, r2, lsl #1 + lsl r8, r8, r2 +7: + ldrh r1, [lr, #-126] +2: + cmp r2, #0x10 + blt 3f + sub r2, r2, #0x10 + put_bits r11, r4, r0, r6 + cmp r4, #0x10 + blt 2b + eor r3, r3, r3 + emit_byte r10, r11, r4, r3, r12 + emit_byte r10, r11, r4, r3, r12 + b 2b +3: + add r2, r1, r2, lsl #4 + ldrh r3, [lr, #2]! + ldr r12, [r9, r2, lsl #2] + ldrb r2, [r5, r2] + put_bits r11, r4, r12, r2 + checkbuf15 r10, r11, r4, r2, r12 + put_bits r11, r4, r3, r1 + checkbuf15 r10, r11, r4, r2, r12 + lsls r8, r8, #0x1 + bne 1b +6: + add r0, sp, #0x20 + add r0, #0xfe + cmp lr, r0 + bhs 1f + ldr r1, [r9] + ldrb r0, [r5] + put_bits r11, r4, r1, r0 + checkbuf15 r10, r11, r4, r0, r1 +1: + ldr r12, [sp, #0x18] + str r11, [r12, #0x8] + str r4, [r12, #0xc] + add r0, r10, #0x1 + add r4, sp, #0x140 + vld1.64 {d8, d9, d10, d11}, [r4, :128]! + vld1.64 {d12, d13, d14, d15}, [r4, :128] + sub r4, r7, #0x1c + mov sp, r4 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf15 diff --git a/simd/arm/arm64/jccolext-neon.c b/simd/arm/arm64/jccolext-neon.c new file mode 100644 index 0000000..89f520a --- /dev/null +++ b/simd/arm/arm64/jccolext-neon.c @@ -0,0 +1,312 @@ +/* + * jccolext-neon.c - colorspace conversion (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* This file is included by jccolor-neon.c */ + +/* + * RGB -> YCbCr conversion is defined by the following equations: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * 0.16874695 = 11059 * 2^-16 + * 0.33125305 = 21709 * 2^-16 + * 0.50000000 = 32768 * 2^-16 + * 0.41868592 = 27439 * 2^-16 + * 0.08131409 = 5329 * 2^-16 + * These constants are defined in jccolor-neon.c + * + * To ensure rounding gives correct values, we add 0.5 to Cb and Cr. + */ + +void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, + JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, + int num_rows) +{ + /* Pointer to RGB(X/A) input data. */ + JSAMPROW inptr; + /* Pointers to Y, Cb and Cr output data. */ + JSAMPROW outptr0, outptr1, outptr2; + + /* Setup conversion constants. */ + const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts); + const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767); + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2); + uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0); + y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1); + y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2); + uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2); + uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0); + y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1); + y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_ll = scaled_128_5; + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3); + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4); + cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5); + uint32x4_t cb_lh = scaled_128_5; + cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3); + cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4); + cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5); + uint32x4_t cb_hl = scaled_128_5; + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3); + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4); + cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5); + uint32x4_t 
cb_hh = scaled_128_5; + cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3); + cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4); + cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_ll = scaled_128_5; + cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7); + uint32x4_t cr_lh = scaled_128_5; + cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5); + cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6); + cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7); + uint32x4_t cr_hl = scaled_128_5; + cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7); + uint32x4_t cr_hh = scaled_128_5; + cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5); + cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6); + cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16), + vshrn_n_u32(cb_lh, 16)); + uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16), + vshrn_n_u32(cb_hh, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16), + vshrn_n_u32(cr_lh, 16)); + uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16), + vshrn_n_u32(cr_hh, 16)); + /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */ + /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */ + vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h))); + vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h))); + + /* Increment pointers. */ + inptr += (16 * RGB_PIXELSIZE); + outptr0 += 16; + outptr1 += 16; + outptr2 += 16; + } + + if (cols_remaining > 8) { + /* To prevent buffer overread by the vector load instructions, the */ + /* last (image_width % 16) columns of data are first memcopied to a */ + /* temporary buffer large enough to accommodate the vector load. 
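+ * The vector lanes beyond cols_remaining then hold stale bytes; this is
+ * harmless because they only affect output columns that the caller
+ * already permits this routine to overwrite (see the ALIGN_SIZE note
+ * below).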
*/ + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1); + y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2); + uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0); + y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1); + y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2); + uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1); + y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2); + uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0); + y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1); + y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_ll = scaled_128_5; + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3); + cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4); + cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5); + uint32x4_t cb_lh = scaled_128_5; + cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3); + cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4); + cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5); + uint32x4_t cb_hl = scaled_128_5; + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3); + cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4); + cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5); + uint32x4_t cb_hh = scaled_128_5; + cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3); + cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4); + cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_ll = scaled_128_5; + cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6); + cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7); + uint32x4_t cr_lh = scaled_128_5; + cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5); + cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6); + cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7); + uint32x4_t cr_hl = scaled_128_5; + cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6); + cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7); + uint32x4_t cr_hh = scaled_128_5; + cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5); + cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6); + cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. 
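+ * In fixed point this computes Y = (19595 * R + 38470 * G + 7471 * B +
+ * 32768) >> 16: vrshrn_n_u32 folds the +32768 rounding term into the
+ * 16-bit shift. Cb and Cr below use the truncating vshrn_n_u32 instead,
+ * because their 0.5 rounding offset (the 32767 term) is already baked
+ * into scaled_128_5.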
*/ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16), + vshrn_n_u32(cb_lh, 16)); + uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16), + vshrn_n_u32(cb_hh, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16), + vshrn_n_u32(cr_lh, 16)); + uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16), + vshrn_n_u32(cr_hh, 16)); + /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */ + /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */ + vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h))); + vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h))); + + } else if (cols_remaining > 0) { + /* To prevent buffer overread by the vector load instructions, the */ + /* last (image_width % 8) columns of data are first memcopied to a */ + /* temporary buffer large enough to accommodate the vector load. */ + ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE]; + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t input_pixels = vld4_u8(inptr); +#else + uint8x8x3_t input_pixels = vld3_u8(inptr); +#endif + uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]); + uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]); + uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0); + y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1); + y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2); + uint32x4_t y_h = vmull_high_laneq_u16(r, consts, 0); + y_h = vmlal_high_laneq_u16(y_h, g, consts, 1); + y_h = vmlal_high_laneq_u16(y_h, b, consts, 2); + + /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */ + uint32x4_t cb_l = scaled_128_5; + cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3); + cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4); + cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5); + uint32x4_t cb_h = scaled_128_5; + cb_h = vmlsl_high_laneq_u16(cb_h, r, consts, 3); + cb_h = vmlsl_high_laneq_u16(cb_h, g, consts, 4); + cb_h = vmlal_high_laneq_u16(cb_h, b, consts, 5); + + /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */ + uint32x4_t cr_l = scaled_128_5; + cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5); + cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6); + cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7); + uint32x4_t cr_h = scaled_128_5; + cr_h = vmlal_high_laneq_u16(cr_h, r, consts, 5); + cr_h = vmlsl_high_laneq_u16(cr_h, g, consts, 6); + cr_h = vmlsl_high_laneq_u16(cr_h, b, consts, 7); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16), + vrshrn_n_u32(y_h, 16)); + /* Descale Cb values (right shift) and narrow to 16-bit. */ + uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16), + vshrn_n_u32(cb_h, 16)); + /* Descale Cr values (right shift) and narrow to 16-bit. */ + uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16), + vshrn_n_u32(cr_h, 16)); + /* Narrow Y, Cb and Cr values to 8-bit and store to memory. 
Buffer */ + /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */ + vst1_u8(outptr0, vmovn_u16(y_u16)); + vst1_u8(outptr1, vmovn_u16(cb_u16)); + vst1_u8(outptr2, vmovn_u16(cr_u16)); + } + } +} diff --git a/simd/arm64/jsimd.c b/simd/arm/arm64/jsimd.c index 0e6c7b9..ca29cd6 100644 --- a/simd/arm64/jsimd.c +++ b/simd/arm/arm64/jsimd.c @@ -16,25 +16,22 @@ */ #define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" #include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" #include <stdio.h> #include <string.h> #include <ctype.h> -#define JSIMD_FASTLD3 1 -#define JSIMD_FASTST3 2 #define JSIMD_FASTTBL 4 static unsigned int simd_support = ~0; static unsigned int simd_huffman = 1; -static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | - JSIMD_FASTTBL; +static unsigned int simd_features = JSIMD_FASTTBL; #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) @@ -154,16 +151,6 @@ init_simd(void) env = getenv("JSIMD_NOHUFFENC"); if ((env != NULL) && (strcmp(env, "1") == 0)) simd_huffman = 0; - env = getenv("JSIMD_FASTLD3"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_features |= JSIMD_FASTLD3; - if ((env != NULL) && (strcmp(env, "0") == 0)) - simd_features &= ~JSIMD_FASTLD3; - env = getenv("JSIMD_FASTST3"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_features |= JSIMD_FASTST3; - if ((env != NULL) && (strcmp(env, "0") == 0)) - simd_features &= ~JSIMD_FASTST3; #endif } @@ -189,6 +176,19 @@ jsimd_can_rgb_ycc(void) GLOBAL(int) jsimd_can_rgb_gray(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -237,20 +237,14 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, switch (cinfo->in_color_space) { case JCS_EXT_RGB: - if (simd_features & JSIMD_FASTLD3) - neonfct = jsimd_extrgb_ycc_convert_neon; - else - neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; + neonfct = jsimd_extrgb_ycc_convert_neon; break; case JCS_EXT_RGBX: case JCS_EXT_RGBA: neonfct = jsimd_extrgbx_ycc_convert_neon; break; case JCS_EXT_BGR: - if (simd_features & JSIMD_FASTLD3) - neonfct = jsimd_extbgr_ycc_convert_neon; - else - neonfct = jsimd_extbgr_ycc_convert_neon_slowld3; + neonfct = jsimd_extbgr_ycc_convert_neon; break; case JCS_EXT_BGRX: case JCS_EXT_BGRA: @@ -265,10 +259,7 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, neonfct = jsimd_extxrgb_ycc_convert_neon; break; default: - if (simd_features & JSIMD_FASTLD3) - neonfct = jsimd_extrgb_ycc_convert_neon; - else - neonfct = jsimd_extrgb_ycc_convert_neon_slowld3; + neonfct = jsimd_extrgb_ycc_convert_neon; break; } @@ -280,6 +271,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows) { + void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + + switch (cinfo->in_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_extrgbx_gray_convert_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_extbgr_gray_convert_neon; + break; + case JCS_EXT_BGRX: + case 
JCS_EXT_BGRA: + neonfct = jsimd_extbgrx_gray_convert_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_extxbgr_gray_convert_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_extxrgb_gray_convert_neon; + break; + default: + neonfct = jsimd_extrgb_gray_convert_neon; + break; + } + + neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); } GLOBAL(void) @@ -291,20 +313,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, switch (cinfo->out_color_space) { case JCS_EXT_RGB: - if (simd_features & JSIMD_FASTST3) - neonfct = jsimd_ycc_extrgb_convert_neon; - else - neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; + neonfct = jsimd_ycc_extrgb_convert_neon; break; case JCS_EXT_RGBX: case JCS_EXT_RGBA: neonfct = jsimd_ycc_extrgbx_convert_neon; break; case JCS_EXT_BGR: - if (simd_features & JSIMD_FASTST3) - neonfct = jsimd_ycc_extbgr_convert_neon; - else - neonfct = jsimd_ycc_extbgr_convert_neon_slowst3; + neonfct = jsimd_ycc_extbgr_convert_neon; break; case JCS_EXT_BGRX: case JCS_EXT_BGRA: @@ -319,11 +335,7 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, neonfct = jsimd_ycc_extxrgb_convert_neon; break; default: - if (simd_features & JSIMD_FASTST3) - neonfct = jsimd_ycc_extrgb_convert_neon; - else - neonfct = jsimd_ycc_extrgb_convert_neon_slowst3; - break; + neonfct = jsimd_ycc_extrgb_convert_neon; } neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); @@ -397,12 +409,34 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, GLOBAL(int) jsimd_can_h2v2_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -410,23 +444,66 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width, + input_data, output_data_ptr); } GLOBAL(int) jsimd_can_h2v2_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_fancy_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h1v2_fancy_upsample(void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -434,23 +511,60 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY 
input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); +} + +GLOBAL(void) +jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ + jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, + output_data_ptr); } GLOBAL(int) jsimd_can_h2v2_merged_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } GLOBAL(int) jsimd_can_h2v1_merged_upsample(void) { + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + + if (simd_support & JSIMD_NEON) + return 1; + return 0; } @@ -458,12 +572,74 @@ GLOBAL(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) { + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v2_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v2_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon; + break; + default: + neonfct = jsimd_h2v2_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) { + void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + + switch (cinfo->out_color_space) { + case JCS_EXT_RGB: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + case JCS_EXT_RGBX: + case JCS_EXT_RGBA: + neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon; + break; + case JCS_EXT_BGR: + neonfct = jsimd_h2v1_extbgr_merged_upsample_neon; + break; + case JCS_EXT_BGRX: + case JCS_EXT_BGRA: + neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon; + break; + case JCS_EXT_XBGR: + case JCS_EXT_ABGR: + neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon; + break; + case JCS_EXT_XRGB: + case JCS_EXT_ARGB: + neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon; + break; + default: + neonfct = jsimd_h2v1_extrgb_merged_upsample_neon; + break; + } + + neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); } GLOBAL(int) diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S new file mode 100644 index 0000000..898cf2c --- /dev/null +++ b/simd/arm/arm64/jsimd_neon.S @@ -0,0 +1,538 @@ +/* + * ARMv8 NEON optimizations for libjpeg-turbo + * + * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). 
+ * All Rights Reserved. + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> + * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> + * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ +#endif + +#if defined(__APPLE__) +.section __DATA, __const +#elif defined(_WIN32) +.section .rdata +#else +.section .rodata, "a", %progbits +#endif + +/* Constants for jsimd_huff_encode_one_block_neon() */ + +.balign 16 +Ljsimd_huff_encode_one_block_neon_consts: + .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 + .byte 0, 1, 2, 3, 16, 17, 32, 33, \ + 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ + .byte 34, 35, 48, 49, 255, 255, 50, 51, \ + 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ + .byte 8, 9, 22, 23, 36, 37, 50, 51, \ + 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ + .byte 54, 55, 40, 41, 26, 27, 12, 13, \ + 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ + .byte 6, 7, 20, 21, 34, 35, 48, 49, \ + 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ + .byte 42, 43, 28, 29, 14, 15, 30, 31, \ + 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ + .byte 255, 255, 255, 255, 56, 57, 42, 43, \ + 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ + .byte 26, 27, 40, 41, 42, 43, 28, 29, \ + 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ + .byte 255, 255, 255, 255, 0, 1, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ + .byte 255, 255, 255, 255, 255, 255, 255, 255, \ + 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ + .byte 4, 5, 6, 7, 255, 255, 255, 255, \ + 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ + +.text + + +#define RESPECT_STRICT_ALIGNMENT 1 + + +/*****************************************************************************/ + +/* Supplementary macro for setting function attributes */ +.macro asm_function fname +#ifdef __APPLE__ + .private_extern _\fname + .globl _\fname +_\fname: +#else + .global \fname +#ifdef __ELF__ + .hidden \fname + .type \fname, %function +#endif +\fname: +#endif +.endm + +/* Get symbol location */ +.macro get_symbol_loc reg, symbol +#ifdef __APPLE__ + adrp \reg, \symbol@PAGE + 
add \reg, \reg, \symbol@PAGEOFF +#else + adrp \reg, \symbol + add \reg, \reg, :lo12:\symbol +#endif +.endm + + +#define CENTERJSAMPLE 128 + +/*****************************************************************************/ + +/* + * GLOBAL(JOCTET *) + * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, + * JCOEFPTR block, int last_dc_val, + * c_derived_tbl *dctbl, c_derived_tbl *actbl) + * + */ + + BUFFER .req x1 + PUT_BUFFER .req x6 + PUT_BITS .req x7 + PUT_BITSw .req w7 + +.macro emit_byte + sub PUT_BITS, PUT_BITS, #0x8 + lsr x19, PUT_BUFFER, PUT_BITS + uxtb w19, w19 + strb w19, [BUFFER, #1]! + cmp w19, #0xff + b.ne 14f + strb wzr, [BUFFER, #1]! +14: +.endm +.macro put_bits CODE, SIZE + lsl PUT_BUFFER, PUT_BUFFER, \SIZE + add PUT_BITS, PUT_BITS, \SIZE + orr PUT_BUFFER, PUT_BUFFER, \CODE +.endm +.macro checkbuf31 + cmp PUT_BITS, #0x20 + b.lt 31f + emit_byte + emit_byte + emit_byte + emit_byte +31: +.endm +.macro checkbuf47 + cmp PUT_BITS, #0x30 + b.lt 47f + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte + emit_byte +47: +.endm + +.macro generate_jsimd_huff_encode_one_block fast_tbl + +.balign 16 +.if \fast_tbl == 1 +asm_function jsimd_huff_encode_one_block_neon +.else +asm_function jsimd_huff_encode_one_block_neon_slowtbl +.endif + sub sp, sp, 272 + sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ + /* Save ARM registers */ + stp x19, x20, [sp] + get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts + ldr PUT_BUFFER, [x0, #0x10] + ldr PUT_BITSw, [x0, #0x18] + ldrsh w12, [x2] /* load DC coeff in w12 */ + /* prepare data */ +.if \fast_tbl == 1 + ld1 {v23.16b}, [x15], #16 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + /* ZigZag 8x8 */ + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b + tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b + tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b + tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b + ins v0.h[0], w12 + tbx v1.16b, {v28.16b}, v16.16b + tbx v2.16b, {v29.16b, v30.16b}, v17.16b + tbx v5.16b, {v29.16b, v30.16b}, v18.16b + tbx v6.16b, {v31.16b}, v19.16b +.else + add x13, x2, #0x22 + sub w12, w12, w3 /* last_dc_val, not used afterwards */ + ld1 {v23.16b}, [x15] + add x14, x2, #0x18 + add x3, x2, #0x36 + ins v0.h[0], w12 + add x9, x2, #0x2 + ld1 {v1.h}[0], [x13] + add x15, x2, #0x30 + ld1 {v2.h}[0], [x14] + add x19, x2, #0x26 + ld1 {v3.h}[0], [x3] + add x20, x2, #0x28 + ld1 {v0.h}[1], [x9] + add x12, x2, #0x10 + ld1 {v1.h}[1], [x15] + add x13, x2, #0x40 + ld1 {v2.h}[1], [x19] + add x14, x2, #0x34 + ld1 {v3.h}[1], [x20] + add x3, x2, #0x1a + ld1 {v0.h}[2], [x12] + add x9, x2, #0x20 + ld1 {v1.h}[2], [x13] + add x15, x2, #0x32 + ld1 {v2.h}[2], [x14] + add x19, x2, #0x42 + ld1 {v3.h}[2], [x3] + add x20, x2, #0xc + ld1 {v0.h}[3], [x9] + add x12, x2, #0x12 + ld1 {v1.h}[3], [x15] + add x13, x2, #0x24 + ld1 {v2.h}[3], [x19] + add x14, x2, #0x50 + ld1 {v3.h}[3], [x20] + add x3, x2, #0xe + ld1 {v0.h}[4], [x12] + add x9, x2, #0x4 + ld1 {v1.h}[4], [x13] + add x15, x2, #0x16 + ld1 {v2.h}[4], [x14] + add 
x19, x2, #0x60 + ld1 {v3.h}[4], [x3] + add x20, x2, #0x1c + ld1 {v0.h}[5], [x9] + add x12, x2, #0x6 + ld1 {v1.h}[5], [x15] + add x13, x2, #0x8 + ld1 {v2.h}[5], [x19] + add x14, x2, #0x52 + ld1 {v3.h}[5], [x20] + add x3, x2, #0x2a + ld1 {v0.h}[6], [x12] + add x9, x2, #0x14 + ld1 {v1.h}[6], [x13] + add x15, x2, #0xa + ld1 {v2.h}[6], [x14] + add x19, x2, #0x44 + ld1 {v3.h}[6], [x3] + add x20, x2, #0x38 + ld1 {v0.h}[7], [x9] + add x12, x2, #0x46 + ld1 {v1.h}[7], [x15] + add x13, x2, #0x3a + ld1 {v2.h}[7], [x19] + add x14, x2, #0x74 + ld1 {v3.h}[7], [x20] + add x3, x2, #0x6a + ld1 {v4.h}[0], [x12] + add x9, x2, #0x54 + ld1 {v5.h}[0], [x13] + add x15, x2, #0x2c + ld1 {v6.h}[0], [x14] + add x19, x2, #0x76 + ld1 {v7.h}[0], [x3] + add x20, x2, #0x78 + ld1 {v4.h}[1], [x9] + add x12, x2, #0x62 + ld1 {v5.h}[1], [x15] + add x13, x2, #0x1e + ld1 {v6.h}[1], [x19] + add x14, x2, #0x68 + ld1 {v7.h}[1], [x20] + add x3, x2, #0x7a + ld1 {v4.h}[2], [x12] + add x9, x2, #0x70 + ld1 {v5.h}[2], [x13] + add x15, x2, #0x2e + ld1 {v6.h}[2], [x14] + add x19, x2, #0x5a + ld1 {v7.h}[2], [x3] + add x20, x2, #0x6c + ld1 {v4.h}[3], [x9] + add x12, x2, #0x72 + ld1 {v5.h}[3], [x15] + add x13, x2, #0x3c + ld1 {v6.h}[3], [x19] + add x14, x2, #0x4c + ld1 {v7.h}[3], [x20] + add x3, x2, #0x5e + ld1 {v4.h}[4], [x12] + add x9, x2, #0x64 + ld1 {v5.h}[4], [x13] + add x15, x2, #0x4a + ld1 {v6.h}[4], [x14] + add x19, x2, #0x3e + ld1 {v7.h}[4], [x3] + add x20, x2, #0x6e + ld1 {v4.h}[5], [x9] + add x12, x2, #0x56 + ld1 {v5.h}[5], [x15] + add x13, x2, #0x58 + ld1 {v6.h}[5], [x19] + add x14, x2, #0x4e + ld1 {v7.h}[5], [x20] + add x3, x2, #0x7c + ld1 {v4.h}[6], [x12] + add x9, x2, #0x48 + ld1 {v5.h}[6], [x13] + add x15, x2, #0x66 + ld1 {v6.h}[6], [x14] + add x19, x2, #0x5c + ld1 {v7.h}[6], [x3] + add x20, x2, #0x7e + ld1 {v4.h}[7], [x9] + ld1 {v5.h}[7], [x15] + ld1 {v6.h}[7], [x19] + ld1 {v7.h}[7], [x20] +.endif + cmlt v24.8h, v0.8h, #0 + cmlt v25.8h, v1.8h, #0 + cmlt v26.8h, v2.8h, #0 + cmlt v27.8h, v3.8h, #0 + cmlt v28.8h, v4.8h, #0 + cmlt v29.8h, v5.8h, #0 + cmlt v30.8h, v6.8h, #0 + cmlt v31.8h, v7.8h, #0 + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + cmeq v16.8h, v0.8h, #0 + cmeq v17.8h, v1.8h, #0 + cmeq v18.8h, v2.8h, #0 + cmeq v19.8h, v3.8h, #0 + cmeq v20.8h, v4.8h, #0 + cmeq v21.8h, v5.8h, #0 + cmeq v22.8h, v6.8h, #0 + xtn v16.8b, v16.8h + xtn v18.8b, v18.8h + xtn v20.8b, v20.8h + xtn v22.8b, v22.8h + umov w14, v0.h[0] + xtn2 v16.16b, v17.8h + umov w13, v24.h[0] + xtn2 v18.16b, v19.8h + clz w14, w14 + xtn2 v20.16b, v21.8h + lsl w13, w13, w14 + cmeq v17.8h, v7.8h, #0 + sub w12, w14, #32 + xtn2 v22.16b, v17.8h + lsr w13, w13, w14 + and v16.16b, v16.16b, v23.16b + neg w12, w12 + and v18.16b, v18.16b, v23.16b + add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ + and v20.16b, v20.16b, v23.16b + add x15, sp, #0x90 /* x15 = t2 */ + and v22.16b, v22.16b, v23.16b + ldr w10, [x4, x12, lsl #2] + addp v16.16b, v16.16b, v18.16b + ldrb w11, [x3, x12] + addp v20.16b, v20.16b, v22.16b + checkbuf47 + addp v16.16b, v16.16b, v20.16b + put_bits x10, x11 + addp v16.16b, v16.16b, v18.16b + checkbuf47 + umov x9, v16.D[0] + put_bits x13, x12 + cnt v17.8b, v16.8b + mvn x9, x9 + addv B18, v17.8b + add x4, x5, #0x400 
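+/*
+ * A note on the bitstream macros defined above: put_bits shifts a CODE of
+ * SIZE bits into the 64-bit PUT_BUFFER and advances the PUT_BITS count;
+ * checkbuf31 and checkbuf47 flush four or six whole bytes once at least
+ * 32 or 48 bits are pending; emit_byte writes the top pending byte to
+ * BUFFER and, as the JPEG bitstream requires, stuffs a 0x00 after any
+ * literal 0xFF byte.
+ */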
/* x4 = actbl->ehufsi */ + umov w12, v18.b[0] + lsr x9, x9, #0x1 /* clear AC coeff */ + ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ + rbit x9, x9 /* x9 = index0 */ + ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ + cmp w12, #(64-8) + add x11, sp, #16 + b.lt 4f + cbz x9, 6f + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w20, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + clz w20, w20 + ldrh w3, [x15, #2]! + sub w11, w20, #32 + lsl w3, w3, w20 + neg w11, w11 + lsr w3, w3, w20 + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b + b 6f +4: + movi v21.8h, #0x0010 + clz v0.8h, v0.8h + clz v1.8h, v1.8h + clz v2.8h, v2.8h + clz v3.8h, v3.8h + clz v4.8h, v4.8h + clz v5.8h, v5.8h + clz v6.8h, v6.8h + clz v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + neg v0.8h, v0.8h + neg v1.8h, v1.8h + neg v2.8h, v2.8h + neg v3.8h, v3.8h + neg v4.8h, v4.8h + neg v5.8h, v5.8h + neg v6.8h, v6.8h + neg v7.8h, v7.8h + ushl v24.8h, v24.8h, v0.8h + ushl v25.8h, v25.8h, v1.8h + ushl v26.8h, v26.8h, v2.8h + ushl v27.8h, v27.8h, v3.8h + ushl v28.8h, v28.8h, v4.8h + ushl v29.8h, v29.8h, v5.8h + ushl v30.8h, v30.8h, v6.8h + ushl v31.8h, v31.8h, v7.8h + add v0.8h, v21.8h, v0.8h + add v1.8h, v21.8h, v1.8h + add v2.8h, v21.8h, v2.8h + add v3.8h, v21.8h, v3.8h + add v4.8h, v21.8h, v4.8h + add v5.8h, v21.8h, v5.8h + add v6.8h, v21.8h, v6.8h + add v7.8h, v21.8h, v7.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 +1: + clz x2, x9 + add x15, x15, x2, lsl #1 + lsl x9, x9, x2 + ldrh w11, [x15, #-126] +2: + cmp x2, #0x10 + b.lt 3f + sub x2, x2, #0x10 + checkbuf47 + put_bits x13, x14 + b 2b +3: + ldrh w3, [x15, #2]! + add x2, x11, x2, lsl #4 + lsl x9, x9, #0x1 + ldr w12, [x5, x2, lsl #2] + ldrb w10, [x4, x2] + checkbuf31 + put_bits x12, x10 + put_bits x3, x11 + cbnz x9, 1b +6: + add x13, sp, #0x10e + cmp x15, x13 + b.hs 1f + ldr w12, [x5] + ldrb w14, [x4] + checkbuf47 + put_bits x12, x14 +1: + str PUT_BUFFER, [x0, #0x10] + str PUT_BITSw, [x0, #0x18] + ldp x19, x20, [sp], 16 + add x0, BUFFER, #0x1 + add sp, sp, 256 + br x30 + +.endm + +generate_jsimd_huff_encode_one_block 1 +generate_jsimd_huff_encode_one_block 0 + + .unreq BUFFER + .unreq PUT_BUFFER + .unreq PUT_BITS + .unreq PUT_BITSw + +.purgem emit_byte +.purgem put_bits +.purgem checkbuf31 +.purgem checkbuf47 diff --git a/simd/arm/common/jccolor-neon.c b/simd/arm/common/jccolor-neon.c new file mode 100644 index 0000000..f87c8d9 --- /dev/null +++ b/simd/arm/common/jccolor-neon.c @@ -0,0 +1,158 @@ +/* + * jccolor-neon.c - colorspace conversion (Arm Neon) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* RGB -> YCbCr conversion constants. */ + +#define F_0_298 19595 +#define F_0_587 38470 +#define F_0_113 7471 +#define F_0_168 11059 +#define F_0_331 21709 +#define F_0_500 32768 +#define F_0_418 27439 +#define F_0_081 5329 + +ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = { + F_0_298, F_0_587, + F_0_113, F_0_168, + F_0_331, F_0_500, + F_0_418, F_0_081 + }; + +/* Include inline routines for colorspace extensions. */ + +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else 
+#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon +#if defined(__aarch64__) +#include "../arm64/jccolext-neon.c" +#else +#include "../arm/jccolext-neon.c" +#endif +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_ycc_convert_neon diff --git a/simd/arm/common/jcgray-neon.c b/simd/arm/common/jcgray-neon.c new file mode 100644 index 0000000..39d903f --- /dev/null +++ b/simd/arm/common/jcgray-neon.c @@ -0,0 +1,118 @@ +/* + * jcgray-neon.c - grayscale colorspace conversion (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* RGB -> Grayscale conversion constants. */ + +#define F_0_298 19595 +#define F_0_587 38470 +#define F_0_113 7471 + +/* Include inline routines for colorspace extensions. 
*/ + +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extrgb_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extrgbx_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extbgr_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extbgrx_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extxbgr_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_rgb_gray_convert_neon jsimd_extxrgb_gray_convert_neon +#include "jcgryext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_rgb_gray_convert_neon diff --git a/simd/arm/common/jcgryext-neon.c b/simd/arm/common/jcgryext-neon.c new file mode 100644 index 0000000..69ea67f --- /dev/null +++ b/simd/arm/common/jcgryext-neon.c @@ -0,0 +1,107 @@ +/* + * jcgryext-neon.c - grayscale colorspace conversion (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ */ + +/* This file is included by jcgray-neon.c */ + +/* + * RGB -> Grayscale conversion is defined by the following equation: + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * + * Avoid floating point arithmetic by using shifted integer constants: + * 0.29899597 = 19595 * 2^-16 + * 0.58700561 = 38470 * 2^-16 + * 0.11399841 = 7471 * 2^-16 + * These constants are defined in jcgray-neon.c + * + * We use rounding later to get correct values. + * + * This is the same computation as the RGB -> Y portion of RGB -> YCbCr. + */ + +void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, + JSAMPARRAY input_buf, + JSAMPIMAGE output_buf, + JDIMENSION output_row, + int num_rows) +{ + JSAMPROW inptr; + JSAMPROW outptr; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr = output_buf[0][output_row]; + output_row++; + + int cols_remaining = image_width; + for (; cols_remaining > 0; cols_remaining -= 16) { + + /* To prevent buffer overread by the vector load instructions, the */ + /* last (image_width % 16) columns of data are first memcopied to a */ + /* temporary buffer large enough to accommodate the vector load. */ + if (cols_remaining < 16) { + ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE]; + memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE); + inptr = tmp_buf; + } + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t input_pixels = vld4q_u8(inptr); +#else + uint8x16x3_t input_pixels = vld3q_u8(inptr); +#endif + uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED])); + uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED])); + uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN])); + uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE])); + uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE])); + + /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */ + uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298); + uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298); + uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298); + uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298); + y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587); + y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587); + y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587); + y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587); + y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113); + y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113); + y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113); + y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113); + + /* Descale Y values (rounding right shift) and narrow to 16-bit. */ + uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16), + vrshrn_n_u32(y_lh, 16)); + uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16), + vrshrn_n_u32(y_hh, 16)); + + /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is */ + /* permitted up to the next multiple of ALIGN_SIZE bytes. */ + vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h))); + + /* Increment pointers. */ + inptr += (16 * RGB_PIXELSIZE); + outptr += 16; + } + } +} diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/common/jcsample-neon.c new file mode 100644 index 0000000..a5ddf16 --- /dev/null +++ b/simd/arm/common/jcsample-neon.c @@ -0,0 +1,191 @@ +/* + * jcsample-neon.c - downsampling (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + + +ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 0 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 1 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 2 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 3 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 4 */ + 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 5 */ + 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 6 */ + 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 7 */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* Pad 8 */ + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, /* Pad 9 */ + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, /* Pad 10 */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, /* Pad 11 */ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, /* Pad 12 */ + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* Pad 13 */ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* Pad 14 */ + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Pad 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + + +/* + * Downsample pixel values of a single chroma component i.e. Cb, Cr. + * This version handles the common case of 2:1 horizontal and 1:1 vertical, + * without smoothing. + */ + +void jsimd_h2v1_downsample_neon(JDIMENSION image_width, + int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, + JSAMPARRAY output_data) +{ + JSAMPROW inptr, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. 
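+ * Each 16-byte row of jsimd_h2_downsample_consts lists byte indices with
+ * the index of the last valid column repeated, so the table lookup below
+ * clamps any column at or beyond image_width to the final pixel value;
+ * mask_offset selects the row matching the padding amount. The alternating
+ * {0, 1} bias reproduces jpeglib's scalar h2v1 rounding, in scalar terms:
+ * out[j] = (in[2*j] + in[2*j+1] + (j & 1)) >> 1.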
*/ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = vld1q_u8( + &jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern alternating every pixel. */ + const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 }; + + for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr = input_data[outrow]; + + /* Downsample all but the last DCT block of pixels. */ + for (unsigned i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE); + /* Add adjacent pixel values, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + /* Store samples to memory. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. */ + uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) + /* Pad the empty elements with the value of the last pixel. */ + pixels = vqtbl1q_u8(pixels, expand_mask); +#else + uint8x8x2_t table = { vget_low_u8(pixels), vget_high_u8(pixels) }; + pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)), + vtbl2_u8(table, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels); + /* Divide total by 2, narrow to 8-bit and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} + + +/* + * Downsample pixel values of a single chroma component i.e. Cb, Cr. + * This version handles the standard case of 2:1 horizontal and 2:1 vertical, + * without smoothing. + */ + +void jsimd_h2v2_downsample_neon(JDIMENSION image_width, + int max_v_samp_factor, + JDIMENSION v_samp_factor, + JDIMENSION width_in_blocks, + JSAMPARRAY input_data, + JSAMPARRAY output_data) +{ + JSAMPROW inptr0, inptr1, outptr; + /* Load expansion mask to pad remaining elements of last DCT block. */ + const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width); + const uint8x16_t expand_mask = vld1q_u8( + &jsimd_h2_downsample_consts[mask_offset]); + /* Load bias pattern alternating every pixel. */ + const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 }; + + for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) { + outptr = output_data[outrow]; + inptr0 = input_data[outrow]; + inptr1 = input_data[outrow + 1]; + + /* Downsample all but the last DCT block of pixels. */ + for (unsigned i = 0; i < width_in_blocks - 1; i++) { + uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE); + uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE); + /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4 and narrow to 8-bit. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + /* Store samples to memory and increment pointers. */ + vst1_u8(outptr + i * DCTSIZE, samples_u8); + } + + /* Load pixels in last DCT block into a table. 
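+ * As in the h2v1 routine above, the expansion mask clamps columns at or
+ * beyond image_width to the last valid pixel; here both input rows are
+ * padded, summed against the alternating {1, 2} bias and divided by four.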
*/ + uint8x16_t pixels_r0 = vld1q_u8( + inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE); + uint8x16_t pixels_r1 = vld1q_u8( + inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE); +#if defined(__aarch64__) + /* Pad the empty elements with the value of the last pixel. */ + pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask); + pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask); +#else + uint8x8x2_t table_r0 = { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) }; + uint8x8x2_t table_r1 = { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) }; + pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)), + vtbl2_u8(table_r0, vget_high_u8(expand_mask))); + pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)), + vtbl2_u8(table_r1, vget_high_u8(expand_mask))); +#endif + /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */ + uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0); + /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */ + samples_u16 = vpadalq_u8(samples_u16, pixels_r1); + /* Divide total by 4, narrow to 8-bit and store. */ + uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2); + vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8); + } +} diff --git a/simd/arm/common/jdcolext-neon.c b/simd/arm/common/jdcolext-neon.c new file mode 100644 index 0000000..b201792 --- /dev/null +++ b/simd/arm/common/jdcolext-neon.c @@ -0,0 +1,330 @@ +/* + * jdcolext-neon.c - colorspace conversion (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdcolor-neon.c. */ + +/* + * YCbCr -> RGB conversion is defined by the following equations: + * R = Y + 1.40200 * (Cr - 128) + * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) + * B = Y + 1.77200 * (Cb - 128) + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.3441467 = 11277 * 2^-15 + * 0.7141418 = 23401 * 2^-15 + * 1.4020386 = 22971 * 2^-14 + * 1.7720337 = 29033 * 2^-14 + * These constants are defined in jdcolor-neon.c. + * + * Rounding is used when descaling to ensure correct results. + */ + +/* + * Notes on safe memory access for YCbCr -> RGB conversion routines: + * + * Input memory buffers can be safely overread up to the next multiple of + * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in + * jmemmgr.c. + * + * The output buffer cannot safely be written beyond output_width since the + * TurboJPEG API permits it to be allocated with or without padding up to the + * next multiple of ALIGN_SIZE bytes. 
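+ *
+ * Hence the routine below converts sixteen pixels per iteration, then a
+ * block of eight, and stores any remaining tail with per-lane writes so
+ * that no byte beyond output_width is touched.
+ *
+ * In scalar terms (ignoring intermediate saturation), with cb_c = Cb - 128
+ * and cr_c = Cr - 128, the fixed-point arithmetic below computes:
+ *
+ * R = clamp(Y + ((22971 * cr_c + 8192) >> 14))
+ * G = clamp(Y + ((-11277 * cb_c - 23401 * cr_c + 16384) >> 15))
+ * B = clamp(Y + ((29033 * cb_c + 8192) >> 14))
+ *
+ * where clamp() saturates to the range [0, 255].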
+ */ + +void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION input_row, + JSAMPARRAY output_buf, + int num_rows) +{ + JSAMPROW outptr; + /* Pointers to Y, Cb and Cr data. */ + JSAMPROW inptr0, inptr1, inptr2; + + const int16x8_t neg_128 = vdupq_n_s16(-128); + + while (--num_rows >= 0) { + inptr0 = input_buf[0][input_row]; + inptr1 = input_buf[1][input_row]; + inptr2 = input_buf[2][input_row]; + input_row++; + outptr = *output_buf++; + int cols_remaining = output_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + uint8x16_t y = vld1q_u8(inptr0); + uint8x16_t cb = vld1q_u8(inptr1); + uint8x16_t cr = vld1q_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128_l = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cr))); + int16x8_t cr_128_h = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cr))); + int16x8_t cb_128_l = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cb))); + int16x8_t cb_128_h = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cb))); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_ll = vmull_n_s16(vget_low_s16(cb_128_l), -F_0_344); + int32x4_t g_sub_y_lh = vmull_n_s16(vget_high_s16(cb_128_l), -F_0_344); + int32x4_t g_sub_y_hl = vmull_n_s16(vget_low_s16(cb_128_h), -F_0_344); + int32x4_t g_sub_y_hh = vmull_n_s16(vget_high_s16(cb_128_h), -F_0_344); + g_sub_y_ll = vmlsl_n_s16(g_sub_y_ll, vget_low_s16(cr_128_l), F_0_714); + g_sub_y_lh = vmlsl_n_s16(g_sub_y_lh, vget_high_s16(cr_128_l), F_0_714); + g_sub_y_hl = vmlsl_n_s16(g_sub_y_hl, vget_low_s16(cr_128_h), F_0_714); + g_sub_y_hh = vmlsl_n_s16(g_sub_y_hh, vget_high_s16(cr_128_h), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15), + vrshrn_n_s32(g_sub_y_lh, 15)); + int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15), + vrshrn_n_s32(g_sub_y_hh, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_l, 1), F_1_402); + int16x8_t r_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_h, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_l, 1), F_1_772); + int16x8_t b_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_h, 1), F_1_772); + /* Add Y. */ + int16x8_t r_l = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), vget_low_u8(y))); + int16x8_t r_h = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), vget_high_u8(y))); + int16x8_t b_l = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), vget_low_u8(y))); + int16x8_t b_h = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), vget_high_u8(y))); + int16x8_t g_l = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), vget_low_u8(y))); + int16x8_t g_h = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), vget_high_u8(y))); + +#if RGB_PIXELSIZE == 4 + uint8x16x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Set alpha channel to opaque (0xFF). 
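+ * JPEG carries no alpha information, so the X/A byte of four-component
+ * output formats is unconditionally forced to 255.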
*/ + rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4q_u8(outptr, rgba); +#elif RGB_PIXELSIZE == 3 + uint8x16x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); + rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); + rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr, rgb); +#else /* RGB565 */ + /* Pack R, G and B values in ratio 5:6:5. */ + uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8); + rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5); + rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11); + uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8); + rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5); + rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11); + /* Store RGB pixel data to memory. */ + vst1q_u16((uint16_t *)outptr, rgb565_l); + vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h); +#endif /* RGB565 */ + + /* Increment pointers. */ + inptr0 += 16; + inptr1 += 16; + inptr2 += 16; + outptr += (RGB_PIXELSIZE * 16); + } + + if (cols_remaining >= 8) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y. */ + int16x8_t r = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4_u8(outptr, rgba); +#elif RGB_PIXELSIZE == 3 + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + vst3_u8(outptr, rgb); +#else /* RGB565 */ + /* Pack R, G and B values in ratio 5:6:5. */ + uint16x8_t rgb565 = vqshluq_n_s16(r, 8); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); + /* Store RGB pixel data to memory. 
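+ * The vqshlu/vsri sequence packs the components in 5:6:5 order; the scalar
+ * equivalent for one pixel, after clamping each component to [0, 255], is
+ * ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3).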
*/ + vst1q_u16((uint16_t *)outptr, rgb565); +#endif /* RGB565 */ + + /* Increment pointers. */ + inptr0 += 8; + inptr1 += 8; + inptr2 += 8; + outptr += (RGB_PIXELSIZE * 8); + cols_remaining -= 8; + } + + /* Handle the tail elements. */ + if (cols_remaining > 0) { + uint8x8_t y = vld1_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y. */ + int16x8_t r = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y)); + int16x8_t b = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y)); + int16x8_t g = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y)); + +#if RGB_PIXELSIZE == 4 + uint8x8x4_t rgba; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgba.val[RGB_RED] = vqmovun_s16(r); + rgba.val[RGB_GREEN] = vqmovun_s16(g); + rgba.val[RGB_BLUE] = vqmovun_s16(b); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + switch (cols_remaining) { + case 7 : + vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6); + case 6 : + vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5); + case 5 : + vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4); + case 4 : + vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3); + case 3 : + vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2); + case 2 : + vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1); + case 1 : + vst4_lane_u8(outptr, rgba, 0); + default: + break; + } +#elif RGB_PIXELSIZE == 3 + uint8x8x3_t rgb; + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + rgb.val[RGB_RED] = vqmovun_s16(r); + rgb.val[RGB_GREEN] = vqmovun_s16(g); + rgb.val[RGB_BLUE] = vqmovun_s16(b); + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 7 : + vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6); + case 6 : + vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5); + case 5 : + vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4); + case 4 : + vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3); + case 3 : + vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2); + case 2 : + vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1); + case 1 : + vst3_lane_u8(outptr, rgb, 0); + default: + break; + } +#else /* RGB565 */ + /* Pack R, G and B values in ratio 5:6:5. */ + uint16x8_t rgb565 = vqshluq_n_s16(r, 8); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5); + rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11); + /* Store RGB565 pixel data to memory. 
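+ * The switch below falls through deliberately: stores run from the highest
+ * remaining pixel down to pixel 0, so exactly cols_remaining pixels are
+ * written and output_width is never exceeded.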
*/ + switch (cols_remaining) { + case 7 : + vst1q_lane_u16(outptr + 6 * RGB_PIXELSIZE, rgb565, 6); + case 6 : + vst1q_lane_u16(outptr + 5 * RGB_PIXELSIZE, rgb565, 5); + case 5 : + vst1q_lane_u16(outptr + 4 * RGB_PIXELSIZE, rgb565, 4); + case 4 : + vst1q_lane_u16(outptr + 3 * RGB_PIXELSIZE, rgb565, 3); + case 3 : + vst1q_lane_u16(outptr + 2 * RGB_PIXELSIZE, rgb565, 2); + case 2 : + vst1q_lane_u16(outptr + RGB_PIXELSIZE, rgb565, 1); + case 1 : + vst1q_lane_u16(outptr, rgb565, 0); + default: + break; + } +#endif /* RGB565 */ + } + } +} diff --git a/simd/arm/common/jdcolor-neon.c b/simd/arm/common/jdcolor-neon.c new file mode 100644 index 0000000..52dab1e --- /dev/null +++ b/simd/arm/common/jdcolor-neon.c @@ -0,0 +1,134 @@ +/* + * jdcolor-neon.c - colorspace conversion (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* YCbCr -> RGB conversion constants. */ + +#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */ +#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */ +#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */ +#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */ + +/* Include inline routines for colorspace extensions. 
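+ * jdcolext-neon.c is expanded once per output format: RGB_RED, RGB_GREEN,
+ * RGB_BLUE and RGB_PIXELSIZE select the byte order, defining RGB_ALPHA
+ * enables the opaque-alpha store paths for the four-byte formats, and
+ * RGB_PIXELSIZE == 2 selects the RGB565 code path.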
*/ + +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extrgbx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extbgrx_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxbgr_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_extxrgb_convert_neon +#include "jdcolext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon + +/* YCbCr -> RGB565 Conversion. */ + +#define RGB_PIXELSIZE 2 +#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon +#include "jdcolext-neon.c" +#undef RGB_PIXELSIZE +#undef jsimd_ycc_rgb_convert_neon diff --git a/simd/arm/common/jdmerge-neon.c b/simd/arm/common/jdmerge-neon.c new file mode 100644 index 0000000..71798c7 --- /dev/null +++ b/simd/arm/common/jdmerge-neon.c @@ -0,0 +1,138 @@ +/* + * jdmerge-neon.c - merged upsampling/color conversion (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* YCbCr -> RGB conversion constants. */ + +#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */ +#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */ +#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */ +#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */ + +/* Include inline routines for colorspace extensions */ + +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE + +#define RGB_RED EXT_RGB_RED +#define RGB_GREEN EXT_RGB_GREEN +#define RGB_BLUE EXT_RGB_BLUE +#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgb_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgb_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_RGBX_RED +#define RGB_GREEN EXT_RGBX_GREEN +#define RGB_BLUE EXT_RGBX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extrgbx_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extrgbx_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_BGR_RED +#define RGB_GREEN EXT_BGR_GREEN +#define RGB_BLUE EXT_BGR_BLUE +#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgr_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgr_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_BGRX_RED +#define RGB_GREEN EXT_BGRX_GREEN +#define RGB_BLUE EXT_BGRX_BLUE +#define RGB_ALPHA 3 +#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extbgrx_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extbgrx_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_XBGR_RED +#define RGB_GREEN EXT_XBGR_GREEN +#define RGB_BLUE EXT_XBGR_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE +#define jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxbgr_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxbgr_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon + +#define RGB_RED EXT_XRGB_RED +#define RGB_GREEN EXT_XRGB_GREEN +#define RGB_BLUE EXT_XRGB_BLUE +#define RGB_ALPHA 0 +#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE +#define 
jsimd_h2v1_merged_upsample_neon jsimd_h2v1_extxrgb_merged_upsample_neon +#define jsimd_h2v2_merged_upsample_neon jsimd_h2v2_extxrgb_merged_upsample_neon +#include "jdmrgext-neon.c" +#undef RGB_RED +#undef RGB_GREEN +#undef RGB_BLUE +#undef RGB_ALPHA +#undef RGB_PIXELSIZE +#undef jsimd_h2v1_merged_upsample_neon +#undef jsimd_h2v2_merged_upsample_neon diff --git a/simd/arm/common/jdmrgext-neon.c b/simd/arm/common/jdmrgext-neon.c new file mode 100644 index 0000000..8533d71 --- /dev/null +++ b/simd/arm/common/jdmrgext-neon.c @@ -0,0 +1,607 @@ +/* + * jdmrgext-neon.c - merged upsampling/color conversion (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* This file is included by jdmerge-neon.c. */ + +/* + * These routines perform simple chroma upsampling - h2v1 or h2v2 - followed by + * YCbCr -> RGB color conversion all in the same function. + * + * As with the standalone functions, YCbCr -> RGB conversion is defined by the + * following equations: + * R = Y + 1.40200 * (Cr - 128) + * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) + * B = Y + 1.77200 * (Cb - 128) + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.3441467 = 11277 * 2^-15 + * 0.7141418 = 23401 * 2^-15 + * 1.4020386 = 22971 * 2^-14 + * 1.7720337 = 29033 * 2^-14 + * These constants are defined in jdmerge-neon.c. + * + * Rounding is used when descaling to ensure correct results. + */ + +/* + * Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion + * routines: + * + * Input memory buffers can be safely overread up to the next multiple of + * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in + * jmemmgr.c. + * + * The output buffer cannot safely be written beyond output_width since the + * TurboJPEG API permits it to be allocated with or without padding up to the + * next multiple of ALIGN_SIZE bytes. + */ + +/* + * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal. + */ + +void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr; + /* Pointers to Y, Cb and Cr data. */ + JSAMPROW inptr0, inptr1, inptr2; + + int16x8_t neg_128 = vdupq_n_s16(-128); + + inptr0 = input_buf[0][in_row_group_ctr]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr = output_buf[0]; + + int cols_remaining = output_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + /* Load Y-values such that even pixel indices are in one vector and odd */ + /* pixel indices are in another vector. 
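+ * vld2_u8 deinterleaves so that each Cb/Cr pair lines up with its two luma
+ * samples; the chroma difference terms are computed once and added to both
+ * the even- and odd-indexed Y vectors, and vzip_u8 then restores pixel
+ * order. In scalar terms, output pixels 2*j and 2*j+1 share cb[j], cr[j].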
*/ + uint8x8x2_t y = vld2_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y and duplicate chroma components; upsampling horizontally. */ + int16x8_t g_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0])); + int16x8_t r_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0])); + int16x8_t b_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0])); + int16x8_t g_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1])); + int16x8_t r_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1])); + int16x8_t b_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + /* Interleave pixel channel values having odd and even pixel indices. */ + uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd)); + uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd)); + uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd)); + +#ifdef RGB_ALPHA + uint8x16x4_t rgba; + rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]); + rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]); + rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]); + /* Set alpha channel to opaque (0xFF). */ + rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + vst4q_u8(outptr, rgba); +#else + uint8x16x3_t rgb; + rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]); + rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]); + rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr, rgb); +#endif + + /* Increment pointers. */ + inptr0 += 16; + inptr1 += 8; + inptr2 += 8; + outptr += (RGB_PIXELSIZE * 16); + } + + if (cols_remaining > 0) { + /* Load y-values such that even pixel indices are in one vector and odd */ + /* pixel indices are in another vector. */ + uint8x8x2_t y = vld2_u8(inptr0); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. 
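+ * Widening the unsigned chroma bytes onto a vector of -128 (vaddw_u8) does
+ * the unsigned-to-signed widening and the -128 centering in a single
+ * instruction; the vreinterpretq casts generate no code. Scalar
+ * equivalent: cr_c = (int16_t)cr[i] - 128.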
*/ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y and duplicate chroma components - upsample horizontally. */ + int16x8_t g_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0])); + int16x8_t r_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0])); + int16x8_t b_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0])); + int16x8_t g_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1])); + int16x8_t r_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1])); + int16x8_t b_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + /* Interleave pixel channel values having odd and even pixel indices. */ + uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd)); + uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd)); + uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba_h; + rgba_h.val[RGB_RED] = r.val[1]; + rgba_h.val[RGB_GREEN] = g.val[1]; + rgba_h.val[RGB_BLUE] = b.val[1]; + /* Set alpha channel to opaque (0xFF). */ + rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + uint8x8x4_t rgba_l; + rgba_l.val[RGB_RED] = r.val[0]; + rgba_l.val[RGB_GREEN] = g.val[0]; + rgba_l.val[RGB_BLUE] = b.val[0]; + /* Set alpha channel to opaque (0xFF). */ + rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. 
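+ * For a tail of nine to fifteen pixels the high half (rgba_h) is stored
+ * lane by lane down to pixel 8, then case 8 writes the low eight pixels
+ * with a single vst4_u8; tails of seven or fewer fall through the per-lane
+ * stores of the low half (rgba_l) only.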
*/ + switch (cols_remaining) { + case 15 : + vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6); + case 14 : + vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5); + case 13 : + vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4); + case 12 : + vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3); + case 11 : + vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2); + case 10 : + vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1); + case 9 : + vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0); + case 8 : + vst4_u8(outptr, rgba_l); + break; + case 7 : + vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6); + case 6 : + vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5); + case 5 : + vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4); + case 4 : + vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3); + case 3 : + vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2); + case 2 : + vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1); + case 1 : + vst4_lane_u8(outptr, rgba_l, 0); + default : + break; + } +#else + uint8x8x3_t rgb_h; + rgb_h.val[RGB_RED] = r.val[1]; + rgb_h.val[RGB_GREEN] = g.val[1]; + rgb_h.val[RGB_BLUE] = b.val[1]; + uint8x8x3_t rgb_l; + rgb_l.val[RGB_RED] = r.val[0]; + rgb_l.val[RGB_GREEN] = g.val[0]; + rgb_l.val[RGB_BLUE] = b.val[0]; + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 15 : + vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6); + case 14 : + vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5); + case 13 : + vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4); + case 12 : + vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3); + case 11 : + vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2); + case 10 : + vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1); + case 9 : + vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0); + case 8 : + vst3_u8(outptr, rgb_l); + break; + case 7 : + vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6); + case 6 : + vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5); + case 5 : + vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4); + case 4 : + vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3); + case 3 : + vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2); + case 2 : + vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1); + case 1 : + vst3_lane_u8(outptr, rgb_l, 0); + default : + break; + } +#endif + } +} + + +/* + * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal + * and 2:1 vertical. + * + * See above for details of color conversion and safe memory buffer access. + */ + +void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ + JSAMPROW outptr0, outptr1; + /* Pointers to Y (both rows), Cb and Cr data. */ + JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2; + + int16x8_t neg_128 = vdupq_n_s16(-128); + + inptr0_0 = input_buf[0][in_row_group_ctr * 2]; + inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1]; + inptr1 = input_buf[1][in_row_group_ctr]; + inptr2 = input_buf[2][in_row_group_ctr]; + outptr0 = output_buf[0]; + outptr1 = output_buf[1]; + + int cols_remaining = output_width; + for (; cols_remaining >= 16; cols_remaining -= 16) { + /* Load Y-values such that even pixel indices are in one vector and odd */ + /* pixel indices are in another vector. */ + uint8x8x2_t y0 = vld2_u8(inptr0_0); + uint8x8x2_t y1 = vld2_u8(inptr0_1); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. 
*/ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y and duplicate chroma components - upsample horizontally. */ + int16x8_t g0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0])); + int16x8_t r0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0])); + int16x8_t b0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0])); + int16x8_t g0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1])); + int16x8_t r0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1])); + int16x8_t b0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1])); + int16x8_t g1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0])); + int16x8_t r1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0])); + int16x8_t b1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0])); + int16x8_t g1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1])); + int16x8_t r1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1])); + int16x8_t b1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + /* Interleave pixel channel values having odd and even pixel indices. */ + uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + +#ifdef RGB_ALPHA + uint8x16x4_t rgba0, rgba1; + rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]); + rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]); + rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]); + rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]); + rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]); + rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]); + /* Set alpha channel to opaque (0xFF). */ + rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF); + /* Store RGBA pixel data to memory. 
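+     vst4q_u8() writes its four channel vectors interleaved, so each store
+     below emits 16 four-byte pixels (64 bytes) to one output row, with the
+     in-memory channel order given by the RGB_RED/GREEN/BLUE/ALPHA indices.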
*/ + vst4q_u8(outptr0, rgba0); + vst4q_u8(outptr1, rgba1); +#else + uint8x16x3_t rgb0, rgb1; + rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]); + rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]); + rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]); + rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]); + rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]); + rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]); + /* Store RGB pixel data to memory. */ + vst3q_u8(outptr0, rgb0); + vst3q_u8(outptr1, rgb1); +#endif + + /* Increment pointers. */ + inptr0_0 += 16; + inptr0_1 += 16; + inptr1 += 8; + inptr2 += 8; + outptr0 += (RGB_PIXELSIZE * 16); + outptr1 += (RGB_PIXELSIZE * 16); + } + + if (cols_remaining > 0) { + /* Load Y-values such that even pixel indices are in one vector and */ + /* odd pixel indices are in another vector. */ + uint8x8x2_t y0 = vld2_u8(inptr0_0); + uint8x8x2_t y1 = vld2_u8(inptr0_1); + uint8x8_t cb = vld1_u8(inptr1); + uint8x8_t cr = vld1_u8(inptr2); + /* Subtract 128 from Cb and Cr. */ + int16x8_t cr_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cr)); + int16x8_t cb_128 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(neg_128), cb)); + /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */ + int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344); + int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344); + g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714); + g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714); + /* Descale G components: shift right 15, round and narrow to 16-bit. */ + int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15), + vrshrn_n_s32(g_sub_y_h, 15)); + /* Compute R-Y: 1.40200 * (Cr - 128) */ + int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402); + /* Compute B-Y: 1.77200 * (Cb - 128) */ + int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772); + /* Add Y and duplicate chroma components - upsample horizontally. */ + int16x8_t g0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0])); + int16x8_t r0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0])); + int16x8_t b0_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0])); + int16x8_t g0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1])); + int16x8_t r0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1])); + int16x8_t b0_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1])); + int16x8_t g1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0])); + int16x8_t r1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0])); + int16x8_t b1_even = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0])); + int16x8_t g1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1])); + int16x8_t r1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1])); + int16x8_t b1_odd = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1])); + /* Convert each component to unsigned and narrow, clamping to [0-255]. */ + /* Interleave pixel channel values having odd and even pixel indices. 
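+     vzip_u8(a, b) interleaves its arguments as {a0, b0, a1, b1, ...}, so
+     zipping the narrowed even- and odd-indexed results restores the
+     original pixel order within each channel.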
*/ + uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd)); + uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd)); + uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd)); + uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd)); + uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd)); + uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd)); + +#ifdef RGB_ALPHA + uint8x8x4_t rgba0_h, rgba1_h; + rgba0_h.val[RGB_RED] = r0.val[1]; + rgba1_h.val[RGB_RED] = r1.val[1]; + rgba0_h.val[RGB_GREEN] = g0.val[1]; + rgba1_h.val[RGB_GREEN] = g1.val[1]; + rgba0_h.val[RGB_BLUE] = b0.val[1]; + rgba1_h.val[RGB_BLUE] = b1.val[1]; + /* Set alpha channel to opaque (0xFF). */ + rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF); + + uint8x8x4_t rgba0_l, rgba1_l; + rgba0_l.val[RGB_RED] = r0.val[0]; + rgba1_l.val[RGB_RED] = r1.val[0]; + rgba0_l.val[RGB_GREEN] = g0.val[0]; + rgba1_l.val[RGB_GREEN] = g1.val[0]; + rgba0_l.val[RGB_BLUE] = b0.val[0]; + rgba1_l.val[RGB_BLUE] = b1.val[0]; + /* Set alpha channel to opaque (0xFF). */ + rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF); + /* Store RGBA pixel data to memory. */ + switch (cols_remaining) { + case 15 : + vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6); + vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6); + case 14 : + vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5); + vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5); + case 13 : + vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4); + vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4); + case 12 : + vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3); + vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3); + case 11 : + vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2); + vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2); + case 10 : + vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1); + vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1); + case 9 : + vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0); + vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0); + case 8 : + vst4_u8(outptr0, rgba0_l); + vst4_u8(outptr1, rgba1_l); + break; + case 7 : + vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6); + vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6); + case 6 : + vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5); + vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5); + case 5 : + vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4); + vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4); + case 4 : + vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3); + vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3); + case 3 : + vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2); + vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2); + case 2 : + vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1); + vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1); + case 1 : + vst4_lane_u8(outptr0, rgba0_l, 0); + vst4_lane_u8(outptr1, rgba1_l, 0); + default : + break; + } +#else + uint8x8x3_t rgb0_h, rgb1_h; + rgb0_h.val[RGB_RED] = r0.val[1]; + rgb1_h.val[RGB_RED] = r1.val[1]; + rgb0_h.val[RGB_GREEN] = g0.val[1]; + rgb1_h.val[RGB_GREEN] = g1.val[1]; + rgb0_h.val[RGB_BLUE] = b0.val[1]; + rgb1_h.val[RGB_BLUE] = b1.val[1]; + + uint8x8x3_t rgb0_l, rgb1_l; + rgb0_l.val[RGB_RED] = r0.val[0]; + rgb1_l.val[RGB_RED] = r1.val[0]; + rgb0_l.val[RGB_GREEN] = g0.val[0]; + rgb1_l.val[RGB_GREEN] = g1.val[0]; + 
rgb0_l.val[RGB_BLUE] = b0.val[0]; + rgb1_l.val[RGB_BLUE] = b1.val[0]; + /* Store RGB pixel data to memory. */ + switch (cols_remaining) { + case 15 : + vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6); + vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6); + case 14 : + vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5); + vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5); + case 13 : + vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4); + vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4); + case 12 : + vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3); + vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3); + case 11 : + vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2); + vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2); + case 10 : + vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1); + vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1); + case 9 : + vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0); + vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0); + case 8 : + vst3_u8(outptr0, rgb0_l); + vst3_u8(outptr1, rgb1_l); + break; + case 7 : + vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6); + vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6); + case 6 : + vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5); + vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5); + case 5 : + vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4); + vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4); + case 4 : + vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3); + vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3); + case 3 : + vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2); + vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2); + case 2 : + vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1); + vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1); + case 1 : + vst3_lane_u8(outptr0, rgb0_l, 0); + vst3_lane_u8(outptr1, rgb1_l, 0); + default : + break; + } +#endif + } +} diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c new file mode 100644 index 0000000..e4f5129 --- /dev/null +++ b/simd/arm/common/jdsample-neon.c @@ -0,0 +1,557 @@ +/* + * jdsample-neon.c - upsampling (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * The diagram below shows a row of samples (luma or chroma) produced by h2v1 + * downsampling. 
+ *
+ *                  s0        s1        s2
+ *             +---------+---------+---------+
+ *             |         |         |         |
+ *             |  p0  p1 |  p2  p3 |  p4  p5 |
+ *             |         |         |         |
+ *             +---------+---------+---------+
+ *
+ * Each sample contains two of the original pixel channel values. These pixel
+ * channel values are centred at positions p0, p1, p2, p3, p4 and p5 above. To
+ * compute the channel values of the original image, we proportionally blend
+ * the adjacent samples in each row.
+ *
+ * There are three cases to consider:
+ *
+ * 1) The first pixel in the original image.
+ *    Pixel channel value p0 contains only a component from sample s0, so we
+ *    set p0 = s0.
+ * 2) The last pixel in the original image.
+ *    Pixel channel value p5 contains only a component from sample s2, so we
+ *    set p5 = s2.
+ * 3) General case (all other pixels in the row).
+ *    Apart from the first and last pixels, every other pixel channel value is
+ *    computed by blending the containing sample and the nearest neighbouring
+ *    sample in the ratio 3:1.
+ *    For example, the pixel channel value centred at p1 would be computed as
+ *    follows:
+ *        3/4 * s0 + 1/4 * s1
+ *    while the pixel channel value centred at p2 would be:
+ *        3/4 * s1 + 1/4 * s0
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  /* Setup constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* Case 1: first pixel channel value in this row of the original image. */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /* General case: */
+    /*    3/4 * containing sample + 1/4 * nearest neighbouring sample */
+    /* For p1: containing sample = s0, nearest neighbouring sample = s1. */
+    /* For p2: containing sample = s1, nearest neighbouring sample = s0. */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
+    /* denote low half and high half respectively. */
+    uint16x8_t s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
+                                       vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
+                                       vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
+                                       vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
+                                       vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* Initially 1 - due to having already stored the first pixel of the */
+    /* image. However, in subsequent iterations of the SIMD loop this offset */
+    /* is (2 * colctr - 1) to stay within the bounds of the sample buffers */
+    /* without having to resort to a slow scalar tail case for the last */
+    /* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays" */
+    /* in jmemmgr.c for details. */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+#if defined(__aarch64__) && defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
+    /* Unrolling by four is beneficial on AArch64 as there are 16 additional */
+    /* 128-bit SIMD registers to accommodate the extra data in flight. */
+    #pragma clang loop unroll_count(4)
+#endif
+    /* We use software pipelining to maximise performance.
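+       The loads and multiply-accumulates for the next block of samples are
+       issued before the previous block's results are stored, which hides
+       the latency of the vector loads.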
The code indented */ + /* an extra 6 spaces begins the next iteration of the loop. */ + for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) { + s0 = vld1q_u8(inptr + colctr - 1); + s1 = vld1q_u8(inptr + colctr); + /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */ + output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2), + vrshrn_n_u16(s1_add_3s0_h, 2)); + output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2), + vshrn_n_u16(s0_add_3s1_h, 2)); + /* Multiplication makes vectors twice as wide: '_l' and '_h' */ + /* suffixes denote low half and high half respectively. */ + s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)), + vget_low_u8(s0), three_u8); + s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)), + vget_high_u8(s0), three_u8); + s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)), + vget_low_u8(s1), three_u8); + s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)), + vget_high_u8(s1), three_u8); + /* Add ordered dithering bias to odd pixel values. */ + s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16); + s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16); + /* Store pixel channel values to memory. */ + vst2q_u8(outptr + outptr_offset, output_pixels); + outptr_offset = 2 * colctr - 1; + } + + /* Complete the last iteration of the loop. */ + /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */ + output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2), + vrshrn_n_u16(s1_add_3s0_h, 2)); + output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2), + vshrn_n_u16(s0_add_3s1_h, 2)); + /* Store pixel channel values to memory. */ + vst2q_u8(outptr + outptr_offset, output_pixels); + + /* Case 2: last pixel channel value in this row of the original image. */ + outptr[2 * downsampled_width - 1] = + GETJSAMPLE(inptr[downsampled_width - 1]); + } +} + + +/* + * The diagram below shows a grid-window of samples (luma or chroma) produced + * by h2v2 downsampling. + * + * s0 s1 + * +---------+---------+ + * | p0 p1 | p2 p3 | + * r0 | | | + * | p4 p5 | p6 p7 | + * +---------+---------+ + * | p8 p9 | p10 p11| + * r1 | | | + * | p12 p13| p14 p15| + * +---------+---------+ + * | p16 p17| p18 p19| + * r2 | | | + * | p20 p21| p22 p23| + * +---------+---------+ + * + * Every sample contains four of the original pixel channel values. The pixels' + * channel values are centred at positions p0, p1, p2,..., p23 above. For a + * given grid-window position, r1 is always used to denote the row of samples + * containing the pixel channel values we are computing. For the top row of + * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in + * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1 + * (p12-p15), the nearest neighbouring samples are in the row below - denoted + * by r2. + * + * To compute the pixel channel values of the original image, we proportionally + * blend the sample containing the pixel centre with the nearest neighbouring + * samples in each row, column and diagonal. + * + * There are three cases to consider: + * + * 1) The first pixel in this row of the original image. + * Pixel channel value p8 only contains components from sample column s0. + * Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1. + * 2) The last pixel in this row of the original image. + * Pixel channel value p11 only contains components from sample column s1. + * Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1. + * 3) General case (all other pixels in the row). 
+ * Apart from the first and last pixels, every other pixel channel value in
+ * the row contains components from samples in adjacent columns.
+ *
+ * For example, the pixel centred at p9 would be computed as follows:
+ *    (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
+ *
+ * This can be broken down into two steps:
+ * 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
+ *        s0colsum = 3/4 * s0r1 + 1/4 * s0r0
+ *        s1colsum = 3/4 * s1r1 + 1/4 * s1r0
+ * 2) Blend the already-blended columns in the ratio 3:1:
+ *        p9 = 3/4 * s0colsum + 1/4 * s1colsum
+ *
+ * The bottom row of pixel channel values in row r1 can be computed in the same
+ * way for each of the three cases, only using samples in row r2 instead of row
+ * r0 - as r2 is the nearest neighbouring row.
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  /* Setup constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
+    /* respectively. */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* Case 1: first pixel channel value in this row of the original image. */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* General case as described above. */
+    /* Step 1: Blend samples vertically in columns s0 and s1. */
+    /* Leave the divide by 4 until the end, when it can be done for both */
+    /* dimensions at once by right-shifting by 4. */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0r0 = vld1q_u8(inptr0);
+    uint8x16_t s0r1 = vld1q_u8(inptr1);
+    uint8x16_t s0r2 = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
+    /* denote low half and high half respectively. */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
+                                      vget_low_u8(s0r1), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
+                                      vget_high_u8(s0r1), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
+                                      vget_low_u8(s0r1), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
+                                      vget_high_u8(s0r1), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
+                                      vget_low_u8(s1r1), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
+                                      vget_high_u8(s1r1), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
+                                      vget_low_u8(s1r1), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
+                                      vget_high_u8(s1r1), three_u8);
+    /* Step 2: Blend the already-blended columns.
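+       vmlaq_u16(a, b, three_u16) computes a + 3 * b, so output0_p1 below is
+       s1colsum0 + 3 * s0colsum0 = s1r0 + 3 * s1r1 + 3 * s0r0 + 9 * s0r1;
+       after the final right-shift by 4 this is exactly the 9/16, 3/16,
+       3/16, 1/16 blend derived above.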
*/ + uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */ + uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4), + vshrn_n_u16(output0_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), + vrshrn_n_u16(output0_p2_h, 4)) + }; + uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4), + vshrn_n_u16(output1_p1_h, 4)), + vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), + vrshrn_n_u16(output1_p2_h, 4)) + }; + /* Store pixel channel values to memory. */ + /* The minimum size of the output buffer for each row is 64 bytes => no */ + /* need to worry about buffer overflow here. See "Creation of 2-D sample */ + /* arrays" in jmemmgr.c for details. */ + vst2q_u8(outptr0 + 1, output_pixels0); + vst2q_u8(outptr1 + 1, output_pixels1); + + /* The first pixel of the image shifted our loads and stores by one */ + /* byte. We have to re-align on a 32-byte boundary at some point before */ + /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */ + /* within the bounds of the sample buffers without having to resort to a */ + /* slow scalar tail case for the last (downsampled_width % 16) samples. */ + /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/ + for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) { + /* Step 1: Blend samples vertically in columns s0 and s1. */ + /* Load and compute s0colsum0 and s0colsum1. */ + s0r0 = vld1q_u8(inptr0 + colctr - 1); + s0r1 = vld1q_u8(inptr1 + colctr - 1); + s0r2 = vld1q_u8(inptr2 + colctr - 1); + s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)), + vget_low_u8(s0r1), three_u8); + s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)), + vget_high_u8(s0r1), three_u8); + s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)), + vget_low_u8(s0r1), three_u8); + s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)), + vget_high_u8(s0r1), three_u8); + /* Load and compute s1colsum0 and s1colsum1. */ + s1r0 = vld1q_u8(inptr0 + colctr); + s1r1 = vld1q_u8(inptr1 + colctr); + s1r2 = vld1q_u8(inptr2 + colctr); + s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)), + vget_low_u8(s1r1), three_u8); + s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)), + vget_high_u8(s1r1), three_u8); + s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)), + vget_low_u8(s1r1), three_u8); + s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)), + vget_high_u8(s1r1), three_u8); + /* Step 2: Blend the already-blended columns. 
*/ + output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16); + output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16); + output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16); + output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16); + output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16); + output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16); + output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16); + output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16); + /* Add ordered dithering bias to odd pixel values. */ + output0_p1_l = vaddq_u16(output0_p1_l, seven_u16); + output0_p1_h = vaddq_u16(output0_p1_h, seven_u16); + output1_p1_l = vaddq_u16(output1_p1_l, seven_u16); + output1_p1_h = vaddq_u16(output1_p1_h, seven_u16); + /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */ + output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4), + vshrn_n_u16(output0_p1_h, 4)); + output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), + vrshrn_n_u16(output0_p2_h, 4)); + output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4), + vshrn_n_u16(output1_p1_h, 4)); + output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), + vrshrn_n_u16(output1_p2_h, 4)); + /* Store pixel channel values to memory. */ + vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0); + vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1); + } + + /* Case 2: last pixel channel value in this row of the original image. */ + int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr0[downsampled_width - 1]); + outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4); + int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 + + GETJSAMPLE(inptr2[downsampled_width - 1]); + outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4); + inrow++; + } +} + + +/* + * The diagram below shows a grid-window of samples (luma or chroma) produced + * by h2v1 downsampling; which has been subsequently rotated 90 degrees. (The + * usual use of h1v2 upsampling is upsampling rotated or transposed h2v1 + * downsampled images.) + * + * s0 s1 + * +---------+---------+ + * | p0 | p1 | + * r0 | | | + * | p2 | p3 | + * +---------+---------+ + * | p4 | p5 | + * r1 | | | + * | p6 | p7 | + * +---------+---------+ + * | p8 | p9 | + * r2 | | | + * | p10 | p11 | + * +---------+---------+ + * + * Every sample contains two of the original pixel channel values. The pixels' + * channel values are centred at positions p0, p1, p2,..., p11 above. For a + * given grid-window position, r1 is always used to denote the row of samples + * containing the pixel channel values we are computing. For the top row of + * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are + * in the row above - denoted by r0. Likewise, for the bottom row of pixels in + * r1 (p6 and p7), the nearest neighbouring samples are in the row below - + * denoted by r2. + * + * To compute the pixel channel values of the original image, we proportionally + * blend the adjacent samples in each column. 
+ * + * For example, the pixel channel value centred at p4 would be computed as + * follows: + * 3/4 * s0r1 + 1/4 * s0r0 + * while the pixel channel value centred at p6 would be: + * 3/4 * s0r1 + 1/4 * s0r2 + */ + +void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor, + JDIMENSION downsampled_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1; + int inrow, outrow; + /* Setup constants. */ + const uint16x8_t one_u16 = vdupq_n_u16(1); + const uint8x8_t three_u8 = vdup_n_u8(3); + + inrow = outrow = 0; + while (outrow < max_v_samp_factor) { + inptr0 = input_data[inrow - 1]; + inptr1 = input_data[inrow]; + inptr2 = input_data[inrow + 1]; + /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */ + /* respectively. */ + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + inrow++; + + /* The size of the input and output buffers is always a multiple of 32 */ + /* bytes => no need to worry about buffer overflow when reading/writing */ + /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */ + for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) { + /* Load samples. */ + uint8x16_t r0 = vld1q_u8(inptr0 + colctr); + uint8x16_t r1 = vld1q_u8(inptr1 + colctr); + uint8x16_t r2 = vld1q_u8(inptr2 + colctr); + /* Blend samples vertically. */ + uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)), + vget_low_u8(r1), three_u8); + uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)), + vget_high_u8(r1), three_u8); + uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)), + vget_low_u8(r1), three_u8); + uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)), + vget_high_u8(r1), three_u8); + /* Add ordered dithering bias to pixel values in even output rows. */ + colsum0_l = vaddq_u16(colsum0_l, one_u16); + colsum0_h = vaddq_u16(colsum0_h, one_u16); + /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */ + uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2), + vshrn_n_u16(colsum0_h, 2)); + uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2), + vrshrn_n_u16(colsum1_h, 2)); + /* Store pixel channel values to memory. */ + vst1q_u8(outptr0 + colctr, output_pixels0); + vst1q_u8(outptr1 + colctr, output_pixels1); + } + } +} + + +/* + * The diagram below shows the operation of h2v1 (simple) upsampling. Each + * sample in the row is duplicated to form two output pixel channel values. + * + * p0 p1 p2 p3 + * +----+----+ +----+----+----+----+ + * | s0 | s1 | -> | s0 | s0 | s1 | s1 | + * +----+----+ +----+----+----+----+ + */ + +void jsimd_h2v1_upsample_neon(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr; + + for (int inrow = 0; inrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr = output_data[inrow]; + for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples - the store interleaves them to produce the */ + /* pattern in the diagram above. */ + uint8x16x2_t output_pixels = { samples, samples }; + /* Store pixel values to memory. */ + /* Due to the way sample buffers are allocated, we don't need to worry */ + /* about tail cases when output_width is not a multiple of 32. */ + /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. 
*/ + vst2q_u8(outptr + 2 * colctr, output_pixels); + } + } +} + + +/* + * The diagram below shows the operation of h2v2 (simple) upsampling. Each + * sample in the row is duplicated to form two output pixel channel values. + * This horizontally-upsampled row is then also duplicated. + * + * p0 p1 p2 p3 + * +-----+-----+ +-----+-----+-----+-----+ + * | s0 | s1 | -> | s0 | s0 | s1 | s1 | + * +-----+-----+ +-----+-----+-----+-----+ + * | s0 | s0 | s1 | s1 | + * +-----+-----+-----+-----+ + */ + +void jsimd_h2v2_upsample_neon(int max_v_samp_factor, + JDIMENSION output_width, + JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr) +{ + JSAMPARRAY output_data = *output_data_ptr; + JSAMPROW inptr, outptr0, outptr1; + + for (int inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { + inptr = input_data[inrow]; + outptr0 = output_data[outrow++]; + outptr1 = output_data[outrow++]; + + for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) { + uint8x16_t samples = vld1q_u8(inptr + colctr); + /* Duplicate the samples - the store interleaves them to produce the */ + /* pattern in the diagram above. */ + uint8x16x2_t output_pixels = { samples, samples }; + /* Store pixel values to memory for both output rows. */ + /* Due to the way sample buffers are allocated, we don't need to worry */ + /* about tail cases when output_width is not a multiple of 32. */ + /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */ + vst2q_u8(outptr0 + 2 * colctr, output_pixels); + vst2q_u8(outptr1 + 2 * colctr, output_pixels); + } + } +} diff --git a/simd/arm/common/jfdctfst-neon.c b/simd/arm/common/jfdctfst-neon.c new file mode 100644 index 0000000..e7b2e96 --- /dev/null +++ b/simd/arm/common/jfdctfst-neon.c @@ -0,0 +1,211 @@ +/* + * jfdctfst-neon.c - fast DCT (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * 'jsimd_fdct_ifast_neon' performs a fast, not so accurate forward DCT + * (Discrete Cosine Transform) on one block of samples. It uses the same + * calculations and produces exactly the same output as IJG's original + * 'jpeg_fdct_ifast' function, which can be found in jfdctfst.c. 
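+ * All intermediate values are kept in 16 bits: each multiplication by a
+ * constant uses vqdmulhq_lane_s16(), which returns (2 * a * b) >> 16 with
+ * saturation, so the constants below are stored pre-scaled by 2^15.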
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17792 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in 'jsimd_fdct_ifast_neon' match up
+ * with those in 'jpeg_fdct_ifast'.
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306   9984
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
+  /* are used followed by vuzp to transpose the block such that we have a */
+  /* column of samples per vector - allowing all rows to be processed at */
+  /* once. */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  /* Pass 1: process rows. */
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2.
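+     vtrnq_s16() exchanges odd 16-bit elements between vector pairs,
+     vtrnq_s32() does the same for 32-bit pairs, and vzipq_s32() interleaves
+     the results; together the three stages transpose the 8x8 block so that
+     each vector once again holds a full row.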
+  */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/simd/arm/common/jfdctint-neon.c b/simd/arm/common/jfdctint-neon.c
new file mode 100644
index 0000000..55abb1b
--- /dev/null
+++ b/simd/arm/common/jfdctint-neon.c
@@ -0,0 +1,371 @@
+/*
+ * jfdctint-neon.c - accurate DCT (Arm NEON)
+ *
+ * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * 'jsimd_fdct_islow_neon' performs a slow-but-accurate forward DCT (Discrete + * Cosine Transform) on one block of samples. It uses the same calculations + * and produces exactly the same output as IJG's original 'jpeg_fdct_islow' + * function, which can be found in jfdctint.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.298631336 = 2446 * 2^-13 + * 0.390180644 = 3196 * 2^-13 + * 0.541196100 = 4433 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.175875602 = 9633 * 2^-13 + * 1.501321110 = 12299 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 1.961570560 = 16069 * 2^-13 + * 2.053119869 = 16819 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * 3.072711026 = 25172 * 2^-13 + * + * See jfdctint.c for further details of the DCT algorithm. Where possible, + * the variable names and comments here in 'jsimd_fdct_islow_neon' match up + * with those in 'jpeg_fdct_islow'. + */ + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define DESCALE_P1 (CONST_BITS - PASS1_BITS) +#define DESCALE_P2 (CONST_BITS + PASS1_BITS) + +#define F_0_298 2446 +#define F_0_390 3196 +#define F_0_541 4433 +#define F_0_765 6270 +#define F_0_899 7373 +#define F_1_175 9633 +#define F_1_501 12299 +#define F_1_847 15137 +#define F_1_961 16069 +#define F_2_053 16819 +#define F_2_562 20995 +#define F_3_072 25172 + +ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = { + F_0_298, -F_0_390, F_0_541, F_0_765, + -F_0_899, F_1_175, F_1_501, -F_1_847, + -F_1_961, F_2_053, -F_2_562, F_3_072 +}; + +void jsimd_fdct_islow_neon(DCTELEM *data) +{ + /* Load DCT constants. */ +#if defined(__clang__) || defined(_MSC_VER) + const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts); +#else + /* GCC does not currently support the intrinsic vld1_<type>_x3(). */ + const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts); + const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4); + const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8); + const int16x4x3_t consts = { consts1, consts2, consts3 }; +#endif + + /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */ + /* are used followed by vuzp to transpose the block such that we have a */ + /* column of samples per vector - allowing all rows to be processed at */ + /* once. 
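+     vld4q_s16() de-interleaves with a stride of four, so val[0] of the
+     first load holds columns 0 and 4 of rows 0-3; vuzpq_s16() then
+     separates the even and odd entries of the two loads, leaving one full
+     column per vector.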
*/ + int16x8x4_t s_rows_0123 = vld4q_s16(data); + int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE); + + int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]); + int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]); + int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]); + int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]); + + int16x8_t col0 = cols_04.val[0]; + int16x8_t col1 = cols_15.val[0]; + int16x8_t col2 = cols_26.val[0]; + int16x8_t col3 = cols_37.val[0]; + int16x8_t col4 = cols_04.val[1]; + int16x8_t col5 = cols_15.val[1]; + int16x8_t col6 = cols_26.val[1]; + int16x8_t col7 = cols_37.val[1]; + + /* Pass 1: process rows. */ + int16x8_t tmp0 = vaddq_s16(col0, col7); + int16x8_t tmp7 = vsubq_s16(col0, col7); + int16x8_t tmp1 = vaddq_s16(col1, col6); + int16x8_t tmp6 = vsubq_s16(col1, col6); + int16x8_t tmp2 = vaddq_s16(col2, col5); + int16x8_t tmp5 = vsubq_s16(col2, col5); + int16x8_t tmp3 = vaddq_s16(col3, col4); + int16x8_t tmp4 = vsubq_s16(col3, col4); + + /* Even part. */ + int16x8_t tmp10 = vaddq_s16(tmp0, tmp3); + int16x8_t tmp13 = vsubq_s16(tmp0, tmp3); + int16x8_t tmp11 = vaddq_s16(tmp1, tmp2); + int16x8_t tmp12 = vsubq_s16(tmp1, tmp2); + + col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS); + col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS); + + int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13); + int32x4_t z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), + consts.val[0], 2); + int32x4_t z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), + consts.val[0], 2); + + int32x4_t col2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13), + consts.val[0], 3); + int32x4_t col2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13), + consts.val[0], 3); + col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1), + vrshrn_n_s32(col2_scaled_h, DESCALE_P1)); + + int32x4_t col6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12), + consts.val[1], 3); + int32x4_t col6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12), + consts.val[1], 3); + col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1), + vrshrn_n_s32(col6_scaled_h, DESCALE_P1)); + + /* Odd part. 
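+     The products are accumulated at 32-bit precision (vmull/vmlal) and are
+     only narrowed back to 16 bits, with rounding, by the final right-shift
+     of DESCALE_P1 bits.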
*/ + int16x8_t z1 = vaddq_s16(tmp4, tmp7); + int16x8_t z2 = vaddq_s16(tmp5, tmp6); + int16x8_t z3 = vaddq_s16(tmp4, tmp6); + int16x8_t z4 = vaddq_s16(tmp5, tmp7); + /* sqrt(2) * c3 */ + int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1); + int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1); + z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1); + z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1); + + /* sqrt(2) * (-c1+c3+c5-c7) */ + int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0); + int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0); + /* sqrt(2) * ( c1+c3-c5+c7) */ + int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1); + int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1); + /* sqrt(2) * ( c1+c3+c5-c7) */ + int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3); + int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3); + /* sqrt(2) * ( c1+c3-c5-c7) */ + int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2); + int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2); + + /* sqrt(2) * (c7-c3) */ + z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0); + z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0); + /* sqrt(2) * (-c1-c3) */ + int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2); + int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2); + /* sqrt(2) * (-c3-c5) */ + int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0); + int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0); + /* sqrt(2) * (c5-c3) */ + int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1); + int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1); + + z3_l = vaddq_s32(z3_l, z5_l); + z3_h = vaddq_s32(z3_h, z5_h); + z4_l = vaddq_s32(z4_l, z5_l); + z4_h = vaddq_s32(z4_h, z5_h); + + tmp4_l = vaddq_s32(tmp4_l, z1_l); + tmp4_h = vaddq_s32(tmp4_h, z1_h); + tmp4_l = vaddq_s32(tmp4_l, z3_l); + tmp4_h = vaddq_s32(tmp4_h, z3_h); + col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1), + vrshrn_n_s32(tmp4_h, DESCALE_P1)); + + tmp5_l = vaddq_s32(tmp5_l, z2_l); + tmp5_h = vaddq_s32(tmp5_h, z2_h); + tmp5_l = vaddq_s32(tmp5_l, z4_l); + tmp5_h = vaddq_s32(tmp5_h, z4_h); + col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1), + vrshrn_n_s32(tmp5_h, DESCALE_P1)); + + tmp6_l = vaddq_s32(tmp6_l, z2_l); + tmp6_h = vaddq_s32(tmp6_h, z2_h); + tmp6_l = vaddq_s32(tmp6_l, z3_l); + tmp6_h = vaddq_s32(tmp6_h, z3_h); + col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1), + vrshrn_n_s32(tmp6_h, DESCALE_P1)); + + tmp7_l = vaddq_s32(tmp7_l, z1_l); + tmp7_h = vaddq_s32(tmp7_h, z1_h); + tmp7_l = vaddq_s32(tmp7_l, z4_l); + tmp7_h = vaddq_s32(tmp7_h, z4_h); + col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1), + vrshrn_n_s32(tmp7_h, DESCALE_P1)); + + /* Transpose to work on columns in pass 2. 
*/ + int16x8x2_t cols_01 = vtrnq_s16(col0, col1); + int16x8x2_t cols_23 = vtrnq_s16(col2, col3); + int16x8x2_t cols_45 = vtrnq_s16(col4, col5); + int16x8x2_t cols_67 = vtrnq_s16(col6, col7); + + int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]), + vreinterpretq_s32_s16(cols_45.val[0])); + int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]), + vreinterpretq_s32_s16(cols_45.val[1])); + int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]), + vreinterpretq_s32_s16(cols_67.val[0])); + int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]), + vreinterpretq_s32_s16(cols_67.val[1])); + + int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]); + int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]); + int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]); + int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]); + + int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]); + int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]); + int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]); + int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]); + int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]); + int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]); + int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]); + int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]); + + /* Pass 2. */ + tmp0 = vaddq_s16(row0, row7); + tmp7 = vsubq_s16(row0, row7); + tmp1 = vaddq_s16(row1, row6); + tmp6 = vsubq_s16(row1, row6); + tmp2 = vaddq_s16(row2, row5); + tmp5 = vsubq_s16(row2, row5); + tmp3 = vaddq_s16(row3, row4); + tmp4 = vsubq_s16(row3, row4); + + /* Even part. */ + tmp10 = vaddq_s16(tmp0, tmp3); + tmp13 = vsubq_s16(tmp0, tmp3); + tmp11 = vaddq_s16(tmp1, tmp2); + tmp12 = vsubq_s16(tmp1, tmp2); + + row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS); + row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS); + + tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13); + z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2); + z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2); + + int32x4_t row2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13), + consts.val[0], 3); + int32x4_t row2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13), + consts.val[0], 3); + row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2), + vrshrn_n_s32(row2_scaled_h, DESCALE_P2)); + + int32x4_t row6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12), + consts.val[1], 3); + int32x4_t row6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12), + consts.val[1], 3); + row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2), + vrshrn_n_s32(row6_scaled_h, DESCALE_P2)); + + /* Odd part. 
*/ + z1 = vaddq_s16(tmp4, tmp7); + z2 = vaddq_s16(tmp5, tmp6); + z3 = vaddq_s16(tmp4, tmp6); + z4 = vaddq_s16(tmp5, tmp7); + /* sqrt(2) * c3 */ + z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1); + z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1); + z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1); + z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1); + + /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0); + tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0); + /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1); + tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1); + /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3); + tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3); + /* sqrt(2) * ( c1+c3-c5-c7) */ + tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2); + tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2); + + /* sqrt(2) * (c7-c3) */ + z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0); + z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0); + /* sqrt(2) * (-c1-c3) */ + z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2); + z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2); + /* sqrt(2) * (-c3-c5) */ + z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0); + z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0); + /* sqrt(2) * (c5-c3) */ + z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1); + z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1); + + z3_l = vaddq_s32(z3_l, z5_l); + z3_h = vaddq_s32(z3_h, z5_h); + z4_l = vaddq_s32(z4_l, z5_l); + z4_h = vaddq_s32(z4_h, z5_h); + + tmp4_l = vaddq_s32(tmp4_l, z1_l); + tmp4_h = vaddq_s32(tmp4_h, z1_h); + tmp4_l = vaddq_s32(tmp4_l, z3_l); + tmp4_h = vaddq_s32(tmp4_h, z3_h); + row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2), + vrshrn_n_s32(tmp4_h, DESCALE_P2)); + + tmp5_l = vaddq_s32(tmp5_l, z2_l); + tmp5_h = vaddq_s32(tmp5_h, z2_h); + tmp5_l = vaddq_s32(tmp5_l, z4_l); + tmp5_h = vaddq_s32(tmp5_h, z4_h); + row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2), + vrshrn_n_s32(tmp5_h, DESCALE_P2)); + + tmp6_l = vaddq_s32(tmp6_l, z2_l); + tmp6_h = vaddq_s32(tmp6_h, z2_h); + tmp6_l = vaddq_s32(tmp6_l, z3_l); + tmp6_h = vaddq_s32(tmp6_h, z3_h); + row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2), + vrshrn_n_s32(tmp6_h, DESCALE_P2)); + + tmp7_l = vaddq_s32(tmp7_l, z1_l); + tmp7_h = vaddq_s32(tmp7_h, z1_h); + tmp7_l = vaddq_s32(tmp7_l, z4_l); + tmp7_h = vaddq_s32(tmp7_h, z4_h); + row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2), + vrshrn_n_s32(tmp7_h, DESCALE_P2)); + + vst1q_s16(data + 0 * DCTSIZE, row0); + vst1q_s16(data + 1 * DCTSIZE, row1); + vst1q_s16(data + 2 * DCTSIZE, row2); + vst1q_s16(data + 3 * DCTSIZE, row3); + vst1q_s16(data + 4 * DCTSIZE, row4); + vst1q_s16(data + 5 * DCTSIZE, row5); + vst1q_s16(data + 6 * DCTSIZE, row6); + vst1q_s16(data + 7 * DCTSIZE, row7); +} diff --git a/simd/arm/common/jidctfst-neon.c b/simd/arm/common/jidctfst-neon.c new file mode 100644 index 0000000..87806fd --- /dev/null +++ b/simd/arm/common/jidctfst-neon.c @@ -0,0 +1,454 @@ +/* + * jidctfst-neon.c - fast IDCT (Arm NEON) + * + * Copyright 2019 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. 
In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * 'jsimd_idct_ifast_neon' performs dequantization and a fast, not so accurate + * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It + * uses the same calculations and produces exactly the same output as IJG's + * original 'jpeg_idct_ifast' function, which can be found in jidctfst.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.082392200 = 2688 * 2^-15 + * 0.414213562 = 13568 * 2^-15 + * 0.847759065 = 27776 * 2^-15 + * 0.613125930 = 20096 * 2^-15 + * + * See jidctfst.c for further details of the IDCT algorithm. Where possible, + * the variable names and comments here in 'jsimd_idct_ifast_neon' match up + * with those in 'jpeg_idct_ifast'. + */ + +#define PASS1_BITS 2 + +#define F_0_082 2688 +#define F_0_414 13568 +#define F_0_847 27776 +#define F_0_613 20096 + +void jsimd_idct_ifast_neon(void *dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + IFAST_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table values for DC coefficients. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + /* Dequantize DC coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + + /* Construct bitmap to test if all AC coefficients are 0. */ + int16x8_t bitmap = vorrq_s16(row1, row2); + bitmap = vorrq_s16(bitmap, row3); + bitmap = vorrq_s16(bitmap, row4); + bitmap = vorrq_s16(bitmap, row5); + bitmap = vorrq_s16(bitmap, row6); + bitmap = vorrq_s16(bitmap, row7); + + int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); + int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); + + if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { + /* All AC coefficients are zero. */ + /* Compute DC values and duplicate into vectors. */ + int16x8_t dcval = row0; + row1 = dcval; + row2 = dcval; + row3 = dcval; + row4 = dcval; + row5 = dcval; + row6 = dcval; + row7 = dcval; + } else if (left_ac_bitmap == 0) { + /* AC coefficients are zero for columns 0, 1, 2 and 3. 
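+     *
+     * A sketch of why one 64-bit compare suffices: the OR of rows 1..7
+     * was narrowed to one 64-bit lane per 4-column half above, so the
+     * lane is zero exactly when
+     *
+     *   acc = 0;
+     *   for (row = 1; row < 8; row++)
+     *     for (col = 0; col < 4; col++)
+     *       acc |= block[row][col];
+     *
+     * leaves acc == 0, i.e. all 28 AC values in this half are zero.
+     * (block[][] is just notation for the coefficient layout here.)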
*/ + /* Use DC values for these columns. */ + int16x4_t dcval = vget_low_s16(row0); + + /* Commence regular fast IDCT computation for columns 4, 5, 6 and 7. */ + /* Load quantization table.*/ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Even part: dequantize DCT coefficients. */ + int16x4_t tmp0 = vget_high_s16(row0); + int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2); + int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4); + int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6); + + int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ + int16x4_t tmp11 = vsub_s16(tmp0, tmp2); + + int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ + int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); + int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414); + tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); + tmp12 = vsub_s16(tmp12, tmp13); + + tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ + tmp3 = vsub_s16(tmp10, tmp13); + tmp1 = vadd_s16(tmp11, tmp12); + tmp2 = vsub_s16(tmp11, tmp12); + + /* Odd part: dequantize DCT coefficients. */ + int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1); + int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3); + int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5); + int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7); + + int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ + int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); + int16x4_t z11 = vadd_s16(tmp4, tmp7); + int16x4_t z12 = vsub_s16(tmp4, tmp7); + + tmp7 = vadd_s16(z11, z13); /* phase 5 */ + int16x4_t z11_sub_z13 = vsub_s16(z11, z13); + tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414); + tmp11 = vadd_s16(tmp11, z11_sub_z13); + + int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); + int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847); + z5 = vadd_s16(z5, z10_add_z12); + tmp10 = vqdmulh_n_s16(z12, F_0_082); + tmp10 = vadd_s16(tmp10, z12); + tmp10 = vsub_s16(tmp10, z5); + tmp12 = vqdmulh_n_s16(neg_z10, F_0_613); + tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); + tmp12 = vadd_s16(tmp12, z5); + + tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ + tmp5 = vsub_s16(tmp11, tmp6); + tmp4 = vadd_s16(tmp10, tmp5); + + row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7)); + row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7)); + row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6)); + row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6)); + row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5)); + row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5)); + row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4)); + row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4)); + } else if (right_ac_bitmap == 0) { + /* AC coefficients are zero for columns 4, 5, 6 and 7. */ + /* Use DC values for these columns. */ + int16x4_t dcval = vget_high_s16(row0); + + /* Commence regular fast IDCT computation for columns 0, 1, 2 and 3. 
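+     *
+     * The fixed-point multiplies below follow one pattern: VQDMULH
+     * computes (2 * a * b) >> 16, so vqdmulh_n_s16(x, F) is x scaled by
+     * the fractional part of the constant (F * 2^-15), and the add of x
+     * that follows restores the integer part. For example:
+     *
+     *   x * 1.414213562 ~= x + ((2 * x * 13568) >> 16)    [F_0_414]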
*/ + /* Load quantization table.*/ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); + int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); + + /* Even part: dequantize DCT coefficients. */ + int16x4_t tmp0 = vget_low_s16(row0); + int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2); + int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4); + int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6); + + int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ + int16x4_t tmp11 = vsub_s16(tmp0, tmp2); + + int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ + int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); + int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414); + tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); + tmp12 = vsub_s16(tmp12, tmp13); + + tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ + tmp3 = vsub_s16(tmp10, tmp13); + tmp1 = vadd_s16(tmp11, tmp12); + tmp2 = vsub_s16(tmp11, tmp12); + + /* Odd part: dequantize DCT coefficients. */ + int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1); + int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3); + int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5); + int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7); + + int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ + int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); + int16x4_t z11 = vadd_s16(tmp4, tmp7); + int16x4_t z12 = vsub_s16(tmp4, tmp7); + + tmp7 = vadd_s16(z11, z13); /* phase 5 */ + int16x4_t z11_sub_z13 = vsub_s16(z11, z13); + tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414); + tmp11 = vadd_s16(tmp11, z11_sub_z13); + + int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); + int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847); + z5 = vadd_s16(z5, z10_add_z12); + tmp10 = vqdmulh_n_s16(z12, F_0_082); + tmp10 = vadd_s16(tmp10, z12); + tmp10 = vsub_s16(tmp10, z5); + tmp12 = vqdmulh_n_s16(neg_z10, F_0_613); + tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); + tmp12 = vadd_s16(tmp12, z5); + + tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ + tmp5 = vsub_s16(tmp11, tmp6); + tmp4 = vadd_s16(tmp10, tmp5); + + row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval); + row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval); + row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval); + row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval); + row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval); + row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval); + row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval); + row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval); + } else { + /* Some AC coefficients are non-zero; full IDCT calculation required. */ + /* Load quantization table.*/ + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Even part: dequantize DCT coefficients. 
*/
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);  /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);  /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_n_s16(tmp1_sub_tmp3, F_0_414);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);  /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);  /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);  /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_n_s16(z12, F_0_082);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);  /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2.
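+   *
+   * A note on scaling (standard libjpeg bookkeeping): the two 1-D passes
+   * leave results a factor of 2^(PASS1_BITS + 3) too large, the 2^3
+   * being inherent in the 8-point transform and 2^PASS1_BITS being
+   * working headroom, which is why the final narrowing below uses
+   * vqshrn_n_s16(col, PASS1_BITS + 3).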
*/ + /* Even part. */ + int16x8_t tmp10 = vaddq_s16(col0, col4); + int16x8_t tmp11 = vsubq_s16(col0, col4); + + int16x8_t tmp13 = vaddq_s16(col2, col6); + int16x8_t col2_sub_col6 = vsubq_s16(col2, col6); + int16x8_t tmp12 = vqdmulhq_n_s16(col2_sub_col6, F_0_414); + tmp12 = vaddq_s16(tmp12, col2_sub_col6); + tmp12 = vsubq_s16(tmp12, tmp13); + + int16x8_t tmp0 = vaddq_s16(tmp10, tmp13); + int16x8_t tmp3 = vsubq_s16(tmp10, tmp13); + int16x8_t tmp1 = vaddq_s16(tmp11, tmp12); + int16x8_t tmp2 = vsubq_s16(tmp11, tmp12); + + /* Odd part. */ + int16x8_t z13 = vaddq_s16(col5, col3); + int16x8_t neg_z10 = vsubq_s16(col3, col5); + int16x8_t z11 = vaddq_s16(col1, col7); + int16x8_t z12 = vsubq_s16(col1, col7); + + int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */ + int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); + tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414); + tmp11 = vaddq_s16(tmp11, z11_sub_z13); + + int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); + int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847); + z5 = vaddq_s16(z5, z10_add_z12); + tmp10 = vqdmulhq_n_s16(z12, F_0_082); + tmp10 = vaddq_s16(tmp10, z12); + tmp10 = vsubq_s16(tmp10, z5); + tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613); + tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); + tmp12 = vaddq_s16(tmp12, z5); + + int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ + int16x8_t tmp5 = vsubq_s16(tmp11, tmp6); + int16x8_t tmp4 = vaddq_s16(tmp10, tmp5); + + col0 = vaddq_s16(tmp0, tmp7); + col7 = vsubq_s16(tmp0, tmp7); + col1 = vaddq_s16(tmp1, tmp6); + col6 = vsubq_s16(tmp1, tmp6); + col2 = vaddq_s16(tmp2, tmp5); + col5 = vsubq_s16(tmp2, tmp5); + col4 = vaddq_s16(tmp3, tmp4); + col3 = vsubq_s16(tmp3, tmp4); + + /* Scale down by factor of 8, narrowing to 8-bit. */ + int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3), + vqshrn_n_s16(col1, PASS1_BITS + 3)); + int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3), + vqshrn_n_s16(col5, PASS1_BITS + 3)); + int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3), + vqshrn_n_s16(col3, PASS1_BITS + 3)); + int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3), + vqshrn_n_s16(col7, PASS1_BITS + 3)); + /* Clamp to range [0-255]. */ + uint8x16_t cols_01 = vreinterpretq_u8_s8( + vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_45 = vreinterpretq_u8_s8( + vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_23 = vreinterpretq_u8_s8( + vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + uint8x16_t cols_67 = vreinterpretq_u8_s8( + vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); + + /* Transpose block ready for store. 
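+   *
+   * After this zip/trn sequence each 16-byte vector holds two complete
+   * 8-pixel output rows, for example
+   *
+   *   rows_04 = [ row 0, pixels 0..7 | row 4, pixels 0..7 ]
+   *
+   * so a single vst1q_lane_u64 per row then writes 8 pixels at once.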
*/
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store pixel block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/common/jidctint-neon.c
new file mode 100644
index 0000000..0fd4a36
--- /dev/null
+++ b/simd/arm/common/jidctint-neon.c
@@ -0,0 +1,758 @@
+/*
+ * jidctint-neon.c - slow IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jconfigint.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <arm_neon.h>
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile-time. Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298 2446
+#define F_0_390 3196
+#define F_0_541 4433
+#define F_0_765 6270
+#define F_0_899 7373
+#define F_1_175 9633
+#define F_1_501 12299
+#define F_1_847 15137
+#define F_1_961 16069
+#define F_2_053 16819
+#define F_2_562 20995
+#define F_3_072 25172
+
+#define F_1_175_MINUS_1_961 (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390 (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847 (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562 (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899 (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899 (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+/* Forward declaration of regular and sparse IDCT helper functions. */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Performs dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation 'jpeg_idct_islow' can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for data access:
+ *
+ * In each pass, the inverse DCT is computed on the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure and the
+ * increased granularity allows an optimized calculation depending on the
+ * values of the DCT coefficients.
Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants, and swapping quadrants 1 and 2 (in the
+ * diagram below). Swapping quadrants is cheap as the second pass can just load
+ * from the other workspace buffer.
+ *
+ *       +-------+-------+             +-------+-------+
+ *       |       |       |             |       |       |
+ *       |   0   |   1   |             |   0   |   2   |
+ *       |       |       |  transpose  |       |       |
+ *       +-------+-------+   ------>   +-------+-------+
+ *       |       |       |             |       |       |
+ *       |   2   |   3   |             |   1   |   3   |
+ *       |       |       |             |       |       |
+ *       +-------+-------+             +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * moving diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, the inverse DCT calculation can be simplified. In this
+ * NEON implementation, on the first pass of the inverse DCT, we test for three
+ * special cases before defaulting to a full 'regular' inverse DCT:
+ *
+ * i)   AC and DC coefficients are all zero. (Only tested for the right 4x8
+ *      half of the DCT coefficient block.) In this case the inverse DCT result
+ *      is all zero. We do no work here, signalling that the 'sparse' case is
+ *      required in the second pass.
+ * ii)  AC coefficients (all but the top row) are zero. In this case, the
+ *      inverse DCT result is just a scaled copy of the DC coefficients.
+ * iii) Coefficients of rows 4, 5, 6 and 7 are all zero. In this case we opt to
+ *      execute a 'sparse' simplified inverse DCT.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block in the first pass (case
+ * 'i'). If this is the case, a 'sparse' variant of the second pass inverse DCT
+ * is executed for both the left and right halves of the DCT block. (The
+ * transposition after the first pass would have made the bottom half of the
+ * block all zero.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table,
+                           JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf,
+                           JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0.
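+   *
+   * The tests below stage the special cases described above; as a sketch
+   * of the control flow:
+   *
+   *   if (rows 4..7 are all 0) {
+   *     if (rows 1..3 are also 0)  use the DC-only shortcut
+   *     else                       use the sparse pass 1 (rows 0..3)
+   *   } else                       use the regular pass 1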
*/ + int16x4_t bitmap = vorr_s16(row7, row6); + bitmap = vorr_s16(bitmap, row5); + bitmap = vorr_s16(bitmap, row4); + int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (bitmap_rows_4567 == 0) { + bitmap = vorr_s16(bitmap, row3); + bitmap = vorr_s16(bitmap, row2); + bitmap = vorr_s16(bitmap, row1); + int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (left_ac_bitmap == 0) { + int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS); + int16x4x4_t quadrant = { dcval, dcval, dcval, dcval }; + /* Store 4x4 blocks to workspace, transposing in the process. */ + vst4_s16(workspace_l, quadrant); + vst4_s16(workspace_r, quadrant); + } else { + jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0, + quant_row1, quant_row2, quant_row3, + workspace_l, workspace_r); + } + } else { + jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5, + row6, row7, quant_row0, quant_row1, + quant_row2, quant_row3, quant_row4, + quant_row5, quant_row6, quant_row7, + workspace_l, workspace_r); + } + + /* Compute IDCT first pass on right 4x8 coefficient block.*/ + /* Load DCT coefficients for right 4x8 block. */ + row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4); + row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4); + row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4); + row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4); + row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4); + row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4); + row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4); + row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4); + + /* Load quantization table for right 4x8 block. */ + quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4); + quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); + quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */ + bitmap = vorr_s16(row7, row6); + bitmap = vorr_s16(bitmap, row5); + bitmap = vorr_s16(bitmap, row4); + bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + bitmap = vorr_s16(bitmap, row3); + bitmap = vorr_s16(bitmap, row2); + bitmap = vorr_s16(bitmap, row1); + int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + /* Initialise to non-zero value: defaults to regular second pass. */ + int64_t right_ac_dc_bitmap = 1; + + if (right_ac_bitmap == 0) { + bitmap = vorr_s16(bitmap, row0); + right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0); + + if (right_ac_dc_bitmap != 0) { + int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS); + int16x4x4_t quadrant = { dcval, dcval, dcval, dcval }; + /* Store 4x4 blocks to workspace, transposing in the process. 
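+       *
+       * vst4_s16 interleaves element i of its four source vectors into
+       * consecutive memory lanes, so the store lands transposed; as a
+       * model of the layout (q is the quadrant, ws the workspace):
+       *
+       *   ws[0..3] = q.val[0][0], q.val[1][0], q.val[2][0], q.val[3][0]
+       *   ws[4..7] = q.val[0][1], q.val[1][1], q.val[2][1], q.val[3][1]
+       *   ...and so on for lanes 2 and 3.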
*/
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+  /* If all coefficients in right 4x8 block are 0, use 'sparse' second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Performs dequantization and the first pass of the slow-but-accurate inverse
+ * DCT on a 4x8 block of coefficients. (To process the full 8x8 DCT block this
+ * function - or some other optimized variant - needs to be called on both the
+ * right and left 4x8 blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the slow IDCT 'jpeg_idct_islow' can be
+ * found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT calculation. */
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+  /* Even part. */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part.
*/ + int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7); + int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5); + int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3); + int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1); + + z3_s16 = vadd_s16(tmp0_s16, tmp2_s16); + int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16); + + /* Implementation as per 'jpeg_idct_islow' in jidctint.c: + * z5 = (z3 + z4) * 1.175875602; + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + * z3 += z5; z4 += z5; + * + * This implementation: + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + */ + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0); + + /* Implementation as per 'jpeg_idct_islow' in jidctint.c: + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + * tmp0 += z1 + z3; tmp1 += z2 + z4; + * tmp2 += z2 + z3; tmp3 += z1 + z4; + * + * This implementation: + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + * tmp0 += z3; tmp1 += z4; + * tmp2 += z3; tmp3 += z4; + */ + + tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3); + tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1); + tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2); + tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0); + + tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2); + tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2); + tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0); + + tmp0 = vaddq_s32(tmp0, z3); + tmp1 = vaddq_s32(tmp1, z4); + tmp2 = vaddq_s32(tmp2, z3); + tmp3 = vaddq_s32(tmp3, z4); + + /* Final output stage: descale and narrow to 16-bit. */ + int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1) + }; + int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1), + vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1) + }; + + /* Store 4x4 blocks to the intermediate workspace ready for second pass. */ + /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */ + vst4_s16(workspace_1, rows_0123); + vst4_s16(workspace_2, rows_4567); +} + + +/* Performs dequantization and the first pass of the slow-but-accurate inverse + * DCT on a 4x8 block of coefficients. + * + * This 'sparse' version assumes that the AC coefficients in rows 4, 5, 6 and 7 + * are all 0. This simplifies the IDCT calculation, accelerating overall + * performance. + */ + +static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0, + int16x4_t row1, + int16x4_t row2, + int16x4_t row3, + int16x4_t quant_row0, + int16x4_t quant_row1, + int16x4_t quant_row2, + int16x4_t quant_row3, + int16_t *workspace_1, + int16_t *workspace_2) +{ + /* Load constants for IDCT computation. 
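+   *
+   * All twelve multipliers are packed into three 64-bit vectors and
+   * loaded once with vld1_s16_x3; each multiply then selects its
+   * constant by lane, for example
+   *
+   *   vmull_lane_s16(z2_s16, consts.val[0], 1)    selects F_0_541
+   *
+   * which keeps the whole constant set in SIMD registers for the pass.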
*/
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+  /* Even part. */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  /* z3 is all 0. */
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part. */
+  /* tmp0 and tmp1 are both all 0. */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+                            vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+                            vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+                            vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+                          };
+  int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+                            vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+                            vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+                            vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+                          };
+
+  /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
+  /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+ * of coefficients. (To process the full 8x8 DCT block this function - or some
+ * other optimized variant - needs to be called on both the right and left 4x8
+ * blocks.)
+ *
+ * This 'regular' version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the slow IDCT 'jpeg_idct_islow' can
+ * be found in jidctint.c. Algorithmic changes made here are documented inline.
+ */
+
+static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+
+  /* Even part.
*/ + int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2); + int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2); + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1); + tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1); + + z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2); + z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2); + + int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part. */ + int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2); + int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2); + int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2); + int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2); + + z3_s16 = vadd_s16(tmp0_s16, tmp2_s16); + int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16); + + /* Implementation as per 'jpeg_idct_islow' in jidctint.c: + * z5 = (z3 + z4) * 1.175875602; + * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + * z3 += z5; z4 += z5; + * + * This implementation: + * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + */ + + int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3); + int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3); + z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3); + z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0); + + /* Implementation as per 'jpeg_idct_islow' in jidctint.c: + * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + * tmp0 += z1 + z3; tmp1 += z2 + z4; + * tmp2 += z2 + z3; tmp3 += z1 + z4; + * + * This implementation: + * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + * tmp0 += z3; tmp1 += z4; + * tmp2 += z3; tmp3 += z4; + */ + + tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3); + tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1); + tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2); + tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0); + + tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0); + tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2); + tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2); + tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0); + + tmp0 = vaddq_s32(tmp0, z3); + tmp1 = vaddq_s32(tmp1, z4); + tmp2 = vaddq_s32(tmp2, z3); + tmp3 = vaddq_s32(tmp3, z4); + + /* Final output stage: descale and narrow to 16-bit. */ + int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3), + vaddhn_s32(tmp12, tmp1)); + int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2), + vaddhn_s32(tmp13, tmp0)); + int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0), + vsubhn_s32(tmp11, tmp2)); + int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1), + vsubhn_s32(tmp10, tmp3)); + /* Descale and narrow to 8-bit. 
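+   *
+   * DESCALE_P2 is CONST_BITS + PASS1_BITS + 3 = 18. The vaddhn/vsubhn
+   * narrowing above already discarded the low 16 bits, so only the
+   * remaining DESCALE_P2 - 16 = 2 bits are shifted out here, with
+   * rounding and signed saturation supplied by vqrshrn_n_s16.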
*/ + int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16); + int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16); + int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16); + int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16); + /* Clamp to range [0-255]. */ + uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8), + vdup_n_u8(CENTERJSAMPLE)); + uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8), + vdup_n_u8(CENTERJSAMPLE)); + + /* Transpose 4x8 block and store to memory. */ + /* Zipping adjacent columns together allows us to store 16-bit elements. */ + uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8); + uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8); + uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]), + vreinterpret_u16_u8(cols_01_23.val[1]), + vreinterpret_u16_u8(cols_45_67.val[0]), + vreinterpret_u16_u8(cols_45_67.val[1]) + }; + + JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col; + JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col; + JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col; + JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col; + /* VST4 of 16-bit elements completes the transpose. */ + vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0); + vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1); + vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2); + vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3); +} + + +/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block + * of coefficients. + * + * This 'sparse' version assumes that the coefficient values (after the first + * pass) in rows 4, 5, 6 and 7 are all 0. This simplifies the IDCT calculation, + * accelerating overall performance. + */ + +static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace, + JSAMPARRAY output_buf, + JDIMENSION output_col, + unsigned buf_offset) +{ + /* Load constants for IDCT computation. */ + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts); + + /* Even part. */ + int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2); + /* z3 is all 0. */ + + int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1); + int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2); + + z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2); + int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS); + int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp3); + int32x4_t tmp13 = vsubq_s32(tmp0, tmp3); + int32x4_t tmp11 = vaddq_s32(tmp1, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp1, tmp2); + + /* Odd part. */ + /* tmp0 and tmp1 are both all 0. 
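+   *
+   * With workspace rows 4..7 known to be all zero, the tmp0 and tmp1
+   * products vanish and z3, z4 reduce to the loaded row values
+   * themselves, roughly halving the odd-part multiplies relative to the
+   * regular second pass.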
*/
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory. */
+  /* Zipping adjacent columns together allows us to store 16-bit elements. */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
+                                    vreinterpret_u16_u8(cols_01_23.val[1]),
+                                    vreinterpret_u16_u8(cols_45_67.val[0]),
+                                    vreinterpret_u16_u8(cols_45_67.val[1])
+                                  };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/common/jidctred-neon.c
new file mode 100644
index 0000000..ed4232c
--- /dev/null
+++ b/simd/arm/common/jidctred-neon.c
@@ -0,0 +1,469 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm NEON)
+ *
+ * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1.
The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jconfigint.h" +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define F_0_211 1730 +#define F_0_509 4176 +#define F_0_601 4926 +#define F_0_720 5906 +#define F_0_765 6270 +#define F_0_850 6967 +#define F_0_899 7373 +#define F_1_061 8697 +#define F_1_272 10426 +#define F_1_451 11893 +#define F_1_847 15137 +#define F_2_172 17799 +#define F_2_562 20995 +#define F_3_624 29692 + +/* + * 'jsimd_idct_2x2_neon' is an inverse-DCT function for getting reduced-size + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original 'jpeg_idct_2x2' function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.720959822 = 5906 * 2^-13 + * 0.850430095 = 6967 * 2^-13 + * 1.272758580 = 10426 * 2^-13 + * 3.624509785 = 29692 * 2^-13 + * + * See jidctred.c for further details of the 2x2 reduced IDCT algorithm. Where + * possible, the variable names and comments here in 'jsimd_idct_2x2_neon' + * match up with those in 'jpeg_idct_2x2'. + * + * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT which + * requires fewer arithmetic operations and hence should be faster. The + * primary purpose of this particular NEON optimized function is bit + * exact compatibility with jpeg-6b. + */ + +void jsimd_idct_2x2_neon(void *dct_table, + JCOEFPTR coef_block, + JSAMPARRAY restrict output_buf, + JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load DCT quantization table. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Dequantize DCT coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + row1 = vmulq_s16(row1, quant_row1); + row3 = vmulq_s16(row3, quant_row3); + row5 = vmulq_s16(row5, quant_row5); + row7 = vmulq_s16(row7, quant_row7); + + /* Pass 1: process input columns; put results in vectors row0 and row1. */ + /* Even part. */ + int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2); + int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2); + + /* Odd part. 
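+   *
+   * In scalar form, matching 'jpeg_idct_2x2' in jidctred.c, the odd term
+   * accumulated below is
+   *
+   *   tmp0 = 3.624509785 * row1 - 1.272758580 * row3
+   *        + 0.850430095 * row5 - 0.720959822 * row7
+   *
+   * and each output pair is then the DC term plus or minus tmp0,
+   * descaled.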
*/ + int32x4_t tmp0_l = vmull_n_s16(vget_low_s16(row1), F_3_624); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row3), -F_1_272); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row5), F_0_850); + tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row7), -F_0_720); + int32x4_t tmp0_h = vmull_n_s16(vget_high_s16(row1), F_3_624); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row3), -F_1_272); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row5), F_0_850); + tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row7), -F_0_720); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS)); + row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS), + vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS)); + + /* Transpose two rows ready for second pass. */ + int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1); + int16x8_t cols_0246 = cols_0246_1357.val[0]; + int16x8_t cols_1357 = cols_0246_1357.val[1]; + /* Duplicate columns such that each is accessible in its own vector. */ + int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357), + vreinterpretq_s32_s16(cols_1357)); + int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]); + int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]); + + /* Pass 2: process 2 rows, store to output array. */ + /* Even part: only interested in col0; top half of tmp10 is "don't care". */ + int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2); + + /* Odd part. Only interested in bottom half of tmp0. */ + int32x4_t tmp0 = vmull_n_s16(vget_low_s16(cols_1155), F_3_624); + tmp0 = vmlal_n_s16(tmp0, vget_low_s16(cols_3377), -F_1_272); + tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_1155), F_0_850); + tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_3377), -F_0_720); + + /* Final output stage: descale and clamp to range [0-255]. */ + int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0), + vsubhn_s32(tmp10, tmp0)); + output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16, + CONST_BITS + PASS1_BITS + 3 + 2 - 16); + /* Narrow to 8-bit and convert to unsigned. */ + uint8x8_t output_u8 = vqmovun_s16(output_s16); + + /* Store 2x2 block to memory. */ + vst1_lane_u8(output_buf[0] + output_col, output_u8, 0); + vst1_lane_u8(output_buf[1] + output_col, output_u8, 1); + vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4); + vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5); +} + + +/* + * 'jsimd_idct_4x4_neon' is an inverse-DCT function for getting reduced-size + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations and + * produces exactly the same output as IJG's original 'jpeg_idct_4x4' function + * from jpeg-6b, which can be found in jidctred.c. + * + * Scaled integer constants are used to avoid floating-point arithmetic: + * 0.211164243 = 1730 * 2^-13 + * 0.509795579 = 4176 * 2^-13 + * 0.601344887 = 4926 * 2^-13 + * 0.765366865 = 6270 * 2^-13 + * 0.899976223 = 7373 * 2^-13 + * 1.061594337 = 8697 * 2^-13 + * 1.451774981 = 11893 * 2^-13 + * 1.847759065 = 15137 * 2^-13 + * 2.172734803 = 17799 * 2^-13 + * 2.562915447 = 20995 * 2^-13 + * + * See jidctred.c for further details of the 4x4 reduced IDCT algorithm. Where + * possible, the variable names and comments here in 'jsimd_idct_4x4_neon' + * match up with those in 'jpeg_idct_4x4'. 
+ * + * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT which + * requires fewer arithmetic operations and hence should be faster. The + * primary purpose of this particular NEON optimized function is bit + * exact compatibility with jpeg-6b. + */ + +ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = { + F_1_847, -F_0_765, -F_0_211, F_1_451, + -F_2_172, F_1_061, -F_0_509, -F_0_601, + F_0_899, F_2_562, 0, 0 + }; + +void jsimd_idct_4x4_neon(void *dct_table, + JCOEFPTR coef_block, + JSAMPARRAY restrict output_buf, + JDIMENSION output_col) +{ + ISLOW_MULT_TYPE *quantptr = dct_table; + + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); + int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); + int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); + int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); + int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); + int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); + int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); + + /* Load quantization table values for DC coefficients. */ + int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); + /* Dequantize DC coefficients. */ + row0 = vmulq_s16(row0, quant_row0); + + /* Construct bitmap to test if all AC coefficients are 0. */ + int16x8_t bitmap = vorrq_s16(row1, row2); + bitmap = vorrq_s16(bitmap, row3); + bitmap = vorrq_s16(bitmap, row5); + bitmap = vorrq_s16(bitmap, row6); + bitmap = vorrq_s16(bitmap, row7); + + int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); + int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); + + /* Load constants for IDCT computation. */ + const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts); + + if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { + /* All AC coefficients are zero. */ + /* Compute DC values and duplicate into row vectors 0, 1, 2 and 3. */ + int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS); + row0 = dcval; + row1 = dcval; + row2 = dcval; + row3 = dcval; + } else if (left_ac_bitmap == 0) { + /* AC coefficients are zero for columns 0, 1, 2 and 3. */ + /* Compute DC values for these columns. */ + int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS); + + /* Commence regular IDCT computation for columns 4, 5, 6 and 7. */ + /* Load quantization table. */ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); + + /* Even part. */ + int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1); + + int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2); + int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6); + + int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part. 
*/ + int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7); + z2 = vmul_s16(vget_high_s16(row5), quant_row5); + z3 = vmul_s16(vget_high_s16(row3), quant_row3); + int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1); + + tmp0 = vmull_lane_s16(z1, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1); + + tmp2 = vmull_lane_s16(z1, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1)); + row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1)); + row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1)); + row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1)); + } else if (right_ac_bitmap == 0) { + /* AC coefficients are zero for columns 4, 5, 6 and 7. */ + /* Compute DC values for these columns. */ + int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS); + + /* Commence regular IDCT computation for columns 0, 1, 2 and 3. */ + /* Load quantization table. */ + int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); + int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); + int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); + int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); + int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); + int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); + + /* Even part. */ + int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1); + + int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2); + int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6); + + int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part. */ + int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7); + z2 = vmul_s16(vget_low_s16(row5), quant_row5); + z3 = vmul_s16(vget_low_s16(row3), quant_row3); + int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1); + + tmp0 = vmull_lane_s16(z1, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1); + + tmp2 = vmull_lane_s16(z1, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1), dcval); + row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2), + CONST_BITS - PASS1_BITS + 1), dcval); + row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1), dcval); + row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0), + CONST_BITS - PASS1_BITS + 1), dcval); + } else { + /* All AC coefficients are non-zero; full IDCT calculation required. 
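+   *
+   * (The bitmap test above works because each 64-bit half of 'bitmap' ORs
+   * the four 16-bit lanes covering columns 0-3, respectively 4-7: a half
+   * is zero iff every AC coefficient in those four columns is zero, which
+   * is common after quantization.)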
*/ + int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); + int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); + int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); + int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); + int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); + int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); + + /* Even part. */ + int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1); + int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1); + + int16x8_t z2 = vmulq_s16(row2, quant_row2); + int16x8_t z3 = vmulq_s16(row6, quant_row6); + + int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0); + int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1); + + int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l); + int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h); + int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l); + int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h); + + /* Odd part. */ + int16x8_t z1 = vmulq_s16(row7, quant_row7); + z2 = vmulq_s16(row5, quant_row5); + z3 = vmulq_s16(row3, quant_row3); + int16x8_t z4 = vmulq_s16(row1, quant_row1); + + tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0); + tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1); + tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0); + tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1); + + tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0); + tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1); + tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0); + tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1); + + /* Final output stage: descale and narrow to 16-bit. */ + row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h), + CONST_BITS - PASS1_BITS + 1)); + row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h), + CONST_BITS - PASS1_BITS + 1)); + row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h), + CONST_BITS - PASS1_BITS + 1)); + row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l), + CONST_BITS - PASS1_BITS + 1), + vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h), + CONST_BITS - PASS1_BITS + 1)); + } + + /* Transpose 8x4 block to perform IDCT on rows in second pass. 
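+   *
+   * Illustrative lane view of the shuffle below: vtrnq_s16 interleaves
+   * 16-bit lanes of (row0, row1) and of (row2, row3), then vtrnq_s32
+   * interleaves 32-bit pairs across the two results, leaving one column
+   * of the original 8x4 block in each 64-bit half, e.g.
+   *
+   *   col0 = { row0[0], row1[0], row2[0], row3[0] }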
*/ + int16x8x2_t row_01 = vtrnq_s16(row0, row1); + int16x8x2_t row_23 = vtrnq_s16(row2, row3); + + int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]), + vreinterpretq_s32_s16(row_23.val[0])); + int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]), + vreinterpretq_s32_s16(row_23.val[1])); + + int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0])); + int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0])); + int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1])); + int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1])); + int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0])); + int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1])); + int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1])); + + /* Commence second pass of IDCT. */ + /* Even part. */ + int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1); + int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0); + tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1); + + int32x4_t tmp10 = vaddq_s32(tmp0, tmp2); + int32x4_t tmp12 = vsubq_s32(tmp0, tmp2); + + /* Odd part. */ + tmp0 = vmull_lane_s16(col7, consts.val[0], 2); + tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3); + tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0); + tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1); + + tmp2 = vmull_lane_s16(col7, consts.val[1], 2); + tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3); + tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0); + tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1); + + /* Final output stage: descale and clamp to range [0-255]. */ + int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2), + vsubhn_s32(tmp12, tmp0)); + int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0), + vsubhn_s32(tmp10, tmp2)); + output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02, + CONST_BITS + PASS1_BITS + 3 + 1 - 16); + output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13, + CONST_BITS + PASS1_BITS + 3 + 1 - 16); + /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements. */ + /* Interleaving store completes the transpose. */ + uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02), + vqmovun_s16(output_cols_13)); + uint16x4x2_t output_01_23 = { vreinterpret_u16_u8(output_0123.val[0]), + vreinterpret_u16_u8(output_0123.val[1]) + }; + + /* Store 4x4 block to memory. */ + JSAMPROW outptr0 = output_buf[0] + output_col; + JSAMPROW outptr1 = output_buf[1] + output_col; + JSAMPROW outptr2 = output_buf[2] + output_col; + JSAMPROW outptr3 = output_buf[3] + output_col; + vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0); + vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1); + vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2); + vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3); +} diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/common/jquanti-neon.c new file mode 100644 index 0000000..6f8a3ab --- /dev/null +++ b/simd/arm/common/jquanti-neon.c @@ -0,0 +1,190 @@ +/* + * jquanti-neon.c - sample conversion and integer quantization (Arm NEON) + * + * Copyright 2020 The Chromium Authors. All Rights Reserved. + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. 
+ * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#define JPEG_INTERNALS +#include "../../../jinclude.h" +#include "../../../jpeglib.h" +#include "../../../jsimd.h" +#include "../../../jdct.h" +#include "../../../jsimddct.h" +#include "../../jsimd.h" + +#include <arm_neon.h> + +/* + * Pixel channel sample values have range [0,255]. The Discrete Cosine + * Transform (DCT) operates on values centered around 0. + * + * To prepare sample values for the DCT, load samples into a DCT workspace, + * subtracting CENTERJSAMPLE (128). The samples, now in range [-128, 127], + * are also widened from 8- to 16-bit. + * + * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c. + */ + +void jsimd_convsamp_neon(JSAMPARRAY sample_data, + JDIMENSION start_col, + DCTELEM *workspace) +{ + uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col); + uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col); + uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col); + uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col); + uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col); + uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col); + uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col); + uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col); + + int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6, + vdup_n_u8(CENTERJSAMPLE))); + int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7, + vdup_n_u8(CENTERJSAMPLE))); + + vst1q_s16(workspace + 0 * DCTSIZE, row0); + vst1q_s16(workspace + 1 * DCTSIZE, row1); + vst1q_s16(workspace + 2 * DCTSIZE, row2); + vst1q_s16(workspace + 3 * DCTSIZE, row3); + vst1q_s16(workspace + 4 * DCTSIZE, row4); + vst1q_s16(workspace + 5 * DCTSIZE, row5); + vst1q_s16(workspace + 6 * DCTSIZE, row6); + vst1q_s16(workspace + 7 * DCTSIZE, row7); +} + + +/* + * After the DCT, the resulting coefficient values need to be divided by a + * quantization value. + * + * To avoid a slow division operation, the DCT coefficients are multiplied by + * the (scaled) reciprocal of the quantization values and then right-shifted. + * + * The equivalent scalar C function 'quantize' can be found in jcdctmgr.c. 
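+ *
+ * Per coefficient c, the vector code below is equivalent to this scalar
+ * sketch (variable names are illustrative, not quoted from jcdctmgr.c):
+ *
+ *   sign = c >> 15                          (0 or -1)
+ *   x = abs(c) + correction
+ *   x = ((uint32_t)x * reciprocal) >> 16    (keep the high half)
+ *   x = x >> shift
+ *   result = (x ^ sign) - sign              (restore the sign)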
+ */ + +void jsimd_quantize_neon(JCOEFPTR coef_block, + DCTELEM *divisors, + DCTELEM *workspace) +{ + JCOEFPTR out_ptr = coef_block; + UDCTELEM *recip_ptr = (UDCTELEM *)divisors; + UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2; + DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2; + + for (int i = 0; i < DCTSIZE; i += DCTSIZE / 2) { + /* Load DCT coefficients. */ + int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE); + int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE); + int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE); + int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE); + /* Load reciprocals of quantization values. */ + uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE); + uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE); + uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE); + uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE); + uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE); + uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE); + uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE); + uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE); + int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE); + int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE); + int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE); + int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE); + + /* Extract sign from coefficients. */ + int16x8_t sign_row0 = vshrq_n_s16(row0, 15); + int16x8_t sign_row1 = vshrq_n_s16(row1, 15); + int16x8_t sign_row2 = vshrq_n_s16(row2, 15); + int16x8_t sign_row3 = vshrq_n_s16(row3, 15); + /* Get absolute value of DCT coefficients. */ + uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0)); + uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1)); + uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2)); + uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3)); + /* Add correction. */ + abs_row0 = vaddq_u16(abs_row0, corr0); + abs_row1 = vaddq_u16(abs_row1, corr1); + abs_row2 = vaddq_u16(abs_row2, corr2); + abs_row3 = vaddq_u16(abs_row3, corr3); + + /* Multiply DCT coefficients by quantization reciprocal. */ + int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0), + vget_low_u16(recip0))); + int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0), + vget_high_u16(recip0))); + int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1), + vget_low_u16(recip1))); + int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1), + vget_high_u16(recip1))); + int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2), + vget_low_u16(recip2))); + int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2), + vget_high_u16(recip2))); + int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3), + vget_low_u16(recip3))); + int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3), + vget_high_u16(recip3))); + /* Narrow back to 16-bit. */ + row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16)); + row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16)); + row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16)); + row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16)); + + /* Since VSHR only supports an immediate as its second argument, negate */ + /* the shift value and shift left. 
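+    (VSHL with a register operand shifts left for positive lane counts
+    and right for negative ones, so e.g. vshlq_u16(x, vdupq_n_s16(-3))
+    performs a per-lane logical x >> 3.)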
*/ + row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0), + vnegq_s16(shift0))); + row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1), + vnegq_s16(shift1))); + row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2), + vnegq_s16(shift2))); + row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3), + vnegq_s16(shift3))); + + /* Restore sign to original product. */ + row0 = veorq_s16(row0, sign_row0); + row0 = vsubq_s16(row0, sign_row0); + row1 = veorq_s16(row1, sign_row1); + row1 = vsubq_s16(row1, sign_row1); + row2 = veorq_s16(row2, sign_row2); + row2 = vsubq_s16(row2, sign_row2); + row3 = veorq_s16(row3, sign_row3); + row3 = vsubq_s16(row3, sign_row3); + + /* Store quantized coefficients to memory. */ + vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0); + vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1); + vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2); + vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3); + } +} diff --git a/simd/arm/jsimd_neon.S b/simd/arm/jsimd_neon.S deleted file mode 100644 index af929fe..0000000 --- a/simd/arm/jsimd_neon.S +++ /dev/null @@ -1,2878 +0,0 @@ -/* - * ARMv7 NEON optimizations for libjpeg-turbo - * - * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). - * All Rights Reserved. - * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> - * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved. - * Copyright (C) 2014, Linaro Limited. All Rights Reserved. - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ -#endif - -.text -.fpu neon -.arch armv7a -.object_arch armv4 -.arm -.syntax unified - - -#define RESPECT_STRICT_ALIGNMENT 1 - - -/*****************************************************************************/ - -/* Supplementary macro for setting function attributes */ -.macro asm_function fname -#ifdef __APPLE__ - .private_extern _\fname - .globl _\fname -_\fname: -#else - .global \fname -#ifdef __ELF__ - .hidden \fname - .type \fname, %function -#endif -\fname: -#endif -.endm - -/* Transpose a block of 4x4 coefficients in four 64-bit registers */ -.macro transpose_4x4 x0, x1, x2, x3 - vtrn.16 \x0, \x1 - vtrn.16 \x2, \x3 - vtrn.32 \x0, \x2 - vtrn.32 \x1, \x3 -.endm - - -#define CENTERJSAMPLE 128 - -/*****************************************************************************/ - -/* - * Perform dequantization and inverse DCT on one block of coefficients. 
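- *
- * (This is the accurate integer IDCT, matching IJG's 'jpeg_idct_islow'
- * from jidctint.c; the 8x8 block is processed as left and right 4x8
- * halves, four columns at a time.)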
- * - * GLOBAL(void) - * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, - * JSAMPARRAY output_buf, JDIMENSION output_col) - */ - -#define FIX_0_298631336 (2446) -#define FIX_0_390180644 (3196) -#define FIX_0_541196100 (4433) -#define FIX_0_765366865 (6270) -#define FIX_0_899976223 (7373) -#define FIX_1_175875602 (9633) -#define FIX_1_501321110 (12299) -#define FIX_1_847759065 (15137) -#define FIX_1_961570560 (16069) -#define FIX_2_053119869 (16819) -#define FIX_2_562915447 (20995) -#define FIX_3_072711026 (25172) - -#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) -#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) -#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) -#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) -#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) -#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) -#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) -#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) - -/* - * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. - * Uses some ideas from the comments in 'simd/jiss2int-64.asm' - */ -#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \ - DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ - JLONG q1, q2, q3, q4, q5, q6, q7; \ - JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \ - \ - /* 1-D iDCT input data */ \ - row0 = xrow0; \ - row1 = xrow1; \ - row2 = xrow2; \ - row3 = xrow3; \ - row4 = xrow4; \ - row5 = xrow5; \ - row6 = xrow6; \ - row7 = xrow7; \ - \ - q5 = row7 + row3; \ - q4 = row5 + row1; \ - q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ - MULTIPLY(q4, FIX_1_175875602); \ - q7 = MULTIPLY(q5, FIX_1_175875602) + \ - MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ - q2 = MULTIPLY(row2, FIX_0_541196100) + \ - MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ - q4 = q6; \ - q3 = ((JLONG)row0 - (JLONG)row4) << 13; \ - q6 += MULTIPLY(row5, -FIX_2_562915447) + \ - MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ - /* now we can use q1 (reloadable constants have been used up) */ \ - q1 = q3 + q2; \ - q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ - MULTIPLY(row1, -FIX_0_899976223); \ - q5 = q7; \ - q1 = q1 + q6; \ - q7 += MULTIPLY(row7, -FIX_0_899976223) + \ - MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ - \ - /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ - tmp11_plus_tmp2 = q1; \ - row1 = 0; \ - \ - q1 = q1 - q6; \ - q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ - MULTIPLY(row3, -FIX_2_562915447); \ - q1 = q1 - q6; \ - q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ - MULTIPLY(row6, FIX_0_541196100); \ - q3 = q3 - q2; \ - \ - /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ - tmp11_minus_tmp2 = q1; \ - \ - q1 = ((JLONG)row0 + (JLONG)row4) << 13; \ - q2 = q1 + q6; \ - q1 = q1 - q6; \ - \ - /* pick up the results */ \ - tmp0 = q4; \ - tmp1 = q5; \ - tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ - tmp3 = q7; \ - tmp10 = q2; \ - tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ - tmp12 = q3; \ - tmp13 = q1; \ -} - -#define XFIX_0_899976223 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_2_562915447 d0[2] -#define XFIX_0_298631336_MINUS_0_899976223 d0[3] -#define XFIX_1_501321110_MINUS_0_899976223 d1[0] -#define XFIX_2_053119869_MINUS_2_562915447 d1[1] 
-#define XFIX_0_541196100_PLUS_0_765366865 d1[2] -#define XFIX_1_175875602 d1[3] -#define XFIX_1_175875602_MINUS_0_390180644 d2[0] -#define XFIX_0_541196100_MINUS_1_847759065 d2[1] -#define XFIX_3_072711026_MINUS_2_562915447 d2[2] -#define XFIX_1_175875602_MINUS_1_961570560 d2[3] - -.balign 16 -jsimd_idct_islow_neon_consts: - .short FIX_0_899976223 /* d0[0] */ - .short FIX_0_541196100 /* d0[1] */ - .short FIX_2_562915447 /* d0[2] */ - .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ - .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ - .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ - .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ - .short FIX_1_175875602 /* d1[3] */ - /* reloadable constants */ - .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ - .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ - .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ - .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ - -asm_function jsimd_idct_islow_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - ROW0L .req d16 - ROW0R .req d17 - ROW1L .req d18 - ROW1R .req d19 - ROW2L .req d20 - ROW2R .req d21 - ROW3L .req d22 - ROW3R .req d23 - ROW4L .req d24 - ROW4R .req d25 - ROW5L .req d26 - ROW5R .req d27 - ROW6L .req d28 - ROW6R .req d29 - ROW7L .req d30 - ROW7R .req d31 - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_islow_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! 
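-    /* (Coefficient loads and dequantizing multiplies are interleaved so
-     * each VMUL can issue while the next VLD1 completes.) */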
- vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */ - add ip, ip, #16 - vmul.s16 q15, q15, q3 - vpush {d8-d15} /* save NEON registers */ - /* 1-D IDCT, pass 1, left 4x8 half */ - vadd.s16 d4, ROW7L, ROW3L - vadd.s16 d5, ROW5L, ROW1L - vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d5, XFIX_1_175875602 - vmull.s16 q7, d4, XFIX_1_175875602 - /* Check for the zero coefficients in the right 4x8 half */ - push {r4, r5} - vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW4L - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065 - orr r0, r4, r5 - vmov q4, q6 - vmlsl.s16 q6, ROW5L, XFIX_2_562915447 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - orr r0, r0, r4 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - orr r0, r0, r5 - vadd.s32 q1, q3, q2 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] - vmov q5, q7 - vadd.s32 q1, q1, q6 - orr r0, r0, r4 - vmlsl.s16 q7, ROW7L, XFIX_0_899976223 - orr r0, r0, r5 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1L, q1, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447 - orr r0, r0, r4 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - orr r0, r0, r5 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] - vmlal.s16 q6, ROW6L, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - orr r0, r0, r4 - vrshrn.s32 ROW6L, q1, #11 - orr r0, r0, r5 - vadd.s32 q1, q3, q5 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW4L - orr r0, r0, r4 - vrshrn.s32 ROW2L, q1, #11 - orr r0, r0, r5 - vrshrn.s32 ROW5L, q3, #11 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223 - orr r0, r0, r4 - vadd.s32 q2, q5, q6 - orrs r0, r0, r5 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - orr r0, r4, r5 - vsub.s32 q3, q1, q4 - pop {r4, r5} - vrshrn.s32 ROW7L, q2, #11 - vrshrn.s32 ROW3L, q5, #11 - vrshrn.s32 ROW0L, q6, #11 - vrshrn.s32 ROW4L, q3, #11 - - beq 3f /* Go to do some special handling for the sparse - right 4x8 half */ - - /* 1-D IDCT, pass 1, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vadd.s16 d10, ROW7R, ROW3R - vadd.s16 d8, ROW5R, ROW1R - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, d8, XFIX_1_175875602 - vtrn.16 ROW2L, ROW3L - vmull.s16 q7, d10, XFIX_1_175875602 - vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644 - vtrn.16 ROW0L, ROW1L - vsubl.s16 q3, ROW0R, ROW4R - vmull.s16 q2, ROW2R, XFIX_0_541196100 - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vtrn.16 ROW4L, ROW5L - vmov q4, q6 - vmlsl.s16 q6, ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447 - vtrn.32 ROW1L, ROW3L - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1R, XFIX_0_899976223 - vtrn.32 ROW4L, ROW6L - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vtrn.32 ROW0L, ROW2L - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223 - vrshrn.s32 ROW1R, q1, #11 - vtrn.32 ROW5L, ROW7L - 
vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW3R, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vrshrn.s32 ROW6R, q1, #11 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0R, ROW4R - vrshrn.s32 ROW2R, q1, #11 - vrshrn.s32 ROW5R, q3, #11 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vrshrn.s32 ROW7R, q2, #11 - vrshrn.s32 ROW3R, q5, #11 - vrshrn.s32 ROW0R, q6, #11 - vrshrn.s32 ROW4R, q3, #11 - /* Transpose right 4x8 half */ - vtrn.16 ROW6R, ROW7R - vtrn.16 ROW2R, ROW3R - vtrn.16 ROW0R, ROW1R - vtrn.16 ROW4R, ROW5R - vtrn.32 ROW1R, ROW3R - vtrn.32 ROW4R, ROW6R - vtrn.32 ROW0R, ROW2R - vtrn.32 ROW5R, ROW7R - -1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */ - vmov q4, q6 - vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */ - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2, right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5R, XFIX_1_175875602 - vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */ - vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */ - vmull.s16 q7, ROW7R, XFIX_1_175875602 - vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */ - vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644 - vmlal.s16 q7, ROW5L, 
XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */ - vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */ - vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065 - vmov q4, q6 - vmlsl.s16 q6, ROW5R, XFIX_2_562915447 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */ - vshl.s32 q3, q3, #13 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */ - vadd.s32 q1, q3, q2 - vmov q5, q7 - vadd.s32 q1, q1, q6 - vmlsl.s16 q7, ROW7R, XFIX_0_899976223 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */ - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */ - vmlal.s16 q6, ROW6R, XFIX_0_541196100 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */ - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vshl.s32 q5, q5, #13 - vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - -2: /* Descale to 8-bit and range limit */ - vqrshrn.s16 d16, q8, #2 - vqrshrn.s16 d17, q9, #2 - vqrshrn.s16 d18, q10, #2 - vqrshrn.s16 d19, q11, #2 - vpop {d8-d15} /* restore NEON registers */ - vqrshrn.s16 d20, q12, #2 - /* Transpose the final 8-bit samples and do signed->unsigned conversion */ - vtrn.16 q8, q9 - vqrshrn.s16 d21, q13, #2 - vqrshrn.s16 d22, q14, #2 - vmov.u8 q0, #(CENTERJSAMPLE) - vqrshrn.s16 d23, q15, #2 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vtrn.16 q10, q11 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vtrn.8 d20, d21 - vst1.8 {d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vadd.u8 q10, q10, q0 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vtrn.8 d22, d23 - vst1.8 {d20}, [TMP1] - vadd.u8 q11, q11, q0 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - -3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ - - /* Transpose left 4x8 half */ - vtrn.16 ROW6L, ROW7L - vtrn.16 ROW2L, ROW3L - vtrn.16 ROW0L, ROW1L - vtrn.16 ROW4L, ROW5L - vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */ - vtrn.32 ROW1L, ROW3L - vtrn.32 ROW4L, ROW6L - vtrn.32 ROW0L, ROW2L - vtrn.32 ROW5L, ROW7L - - cmp r0, #0 - beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second - pass */ - - /* Only row 0 is non-zero for the right 4x8 half */ - vdup.s16 ROW1R, ROW0R[1] - vdup.s16 ROW2R, ROW0R[2] - vdup.s16 ROW3R, ROW0R[3] - vdup.s16 ROW4R, ROW0R[0] - vdup.s16 ROW5R, ROW0R[1] - vdup.s16 ROW6R, ROW0R[2] - vdup.s16 ROW7R, ROW0R[3] - vdup.s16 ROW0R, ROW0R[0] - b 1b /* Go to 'normal' second pass */ - -4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ - 
vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW1L, XFIX_1_175875602 - vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW3L, XFIX_1_175875602 - vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW2L, XFIX_0_541196100 - vshll.s16 q3, ROW0L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW1L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW3L, XFIX_2_562915447 - vshrn.s32 ROW1L, q1, #16 - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */ - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW0L, #13 - vshrn.s32 ROW2L, q1, #16 - vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */ - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW3L, q5, #16 - vshrn.s32 ROW0L, q6, #16 - vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */ - /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ - vld1.s16 {d2}, [ip, :64] /* reload constants */ - vmull.s16 q6, ROW5L, XFIX_1_175875602 - vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 - vmull.s16 q7, ROW7L, XFIX_1_175875602 - vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 - vmull.s16 q2, ROW6L, XFIX_0_541196100 - vshll.s16 q3, ROW4L, #13 - vmov q4, q6 - vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 - vmlsl.s16 q4, ROW5L, XFIX_0_899976223 - vadd.s32 q1, q3, q2 - vmov q5, q7 - vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 - vadd.s32 q1, q1, q6 - vadd.s32 q6, q6, q6 - vmlsl.s16 q5, ROW7L, XFIX_2_562915447 - vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */ - vsub.s32 q1, q1, q6 - vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 - vsub.s32 q3, q3, q2 - vshrn.s32 ROW6R, q1, #16 - vadd.s32 q1, q3, q5 - vsub.s32 q3, q3, q5 - vshll.s16 q5, ROW4L, #13 - vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */ - vshrn.s32 ROW5R, q3, #16 - vadd.s32 q2, q5, q6 - vsub.s32 q1, q5, q6 - vadd.s32 q6, q2, q7 - vsub.s32 q2, q2, q7 - vadd.s32 q5, q1, q4 - vsub.s32 q3, q1, q4 - vshrn.s32 ROW7R, q2, #16 - vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */ - vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */ - vshrn.s32 ROW4R, q3, #16 - b 2b /* Go to epilogue */ - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - .unreq ROW0L - .unreq ROW0R - .unreq ROW1L - .unreq ROW1R - .unreq ROW2L - .unreq ROW2R - .unreq ROW3L - .unreq ROW3R - .unreq ROW4L - .unreq ROW4R - .unreq ROW5L - .unreq ROW5R - .unreq ROW6L - .unreq ROW6R - .unreq ROW7L - .unreq ROW7R - - -/*****************************************************************************/ - -/* - * jsimd_idct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the inverse DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' - * function from jidctfst.c - * - * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. - * But in ARM NEON case some extra additions are required because VQDMULH - * instruction can't handle the constants larger than 1. 
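- *
- * (VQDMULH returns (2 * a * b) >> 16, i.e. it multiplies by b / 2^15, so
- * the constant operand must encode a value below 1.0; the table below
- * therefore stores only the fractional excess, e.g.
- * (277 * 128 - 256 * 128) encodes 277/256 - 1 ~= 0.082392200 in Q15.)
- *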
So the expressions - * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", - * which introduces an extra addition. Overall, there are 6 extra additions - * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. - */ - -#define XFIX_1_082392200 d0[0] -#define XFIX_1_414213562 d0[1] -#define XFIX_1_847759065 d0[2] -#define XFIX_2_613125930 d0[3] - -.balign 16 -jsimd_idct_ifast_neon_consts: - .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ - .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ - .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ - .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ - -asm_function jsimd_idct_ifast_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( q8 ) - * 1 | d18 | d19 ( q9 ) - * 2 | d20 | d21 ( q10 ) - * 3 | d22 | d23 ( q11 ) - * 4 | d24 | d25 ( q12 ) - * 5 | d26 | d27 ( q13 ) - * 6 | d28 | d29 ( q14 ) - * 7 | d30 | d31 ( q15 ) - */ - adr ip, jsimd_idct_ifast_neon_consts - vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]! - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]! - vmul.s16 q8, q8, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q9, q9, q1 - vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]! - vmul.s16 q10, q10, q2 - vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]! - vmul.s16 q11, q11, q3 - vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128] - vmul.s16 q12, q12, q0 - vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]! - vmul.s16 q14, q14, q2 - vmul.s16 q13, q13, q1 - vld1.16 {d0}, [ip, :64] /* load constants */ - vmul.s16 q15, q15, q3 - vpush {d8-d13} /* save NEON registers */ - /* 1-D IDCT, pass 1 */ - vsub.s16 q2, q10, q14 - vadd.s16 q14, q10, q14 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vadd.s16 q10, q10, q2 - /* Transpose */ - vtrn.16 q8, q9 - vsub.s16 q11, q12, q1 - vtrn.16 q14, q15 - vadd.s16 q12, q12, q1 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q8, q10 - vtrn.32 q13, q15 - vswp d28, d21 - vswp d26, d19 - /* 1-D IDCT, pass 2 */ - vsub.s16 q2, q10, q14 - vswp d30, d23 - vadd.s16 q14, q10, q14 - vswp d24, d17 - vsub.s16 q1, q11, q13 - vadd.s16 q13, q11, q13 - vsub.s16 q5, q9, q15 - vadd.s16 q15, q9, q15 - vqdmulh.s16 q4, q2, XFIX_1_414213562 - vqdmulh.s16 q6, q1, XFIX_2_613125930 - vadd.s16 q3, q1, q1 - vsub.s16 q1, q5, q1 - vadd.s16 q10, q2, q4 - vqdmulh.s16 q4, q1, XFIX_1_847759065 - vsub.s16 q2, q15, q13 - vadd.s16 q3, 
q3, q6 - vqdmulh.s16 q6, q2, XFIX_1_414213562 - vadd.s16 q1, q1, q4 - vqdmulh.s16 q4, q5, XFIX_1_082392200 - vsub.s16 q10, q10, q14 - vadd.s16 q2, q2, q6 - vsub.s16 q6, q8, q12 - vadd.s16 q12, q8, q12 - vadd.s16 q9, q5, q4 - vadd.s16 q5, q6, q10 - vsub.s16 q10, q6, q10 - vadd.s16 q6, q15, q13 - vadd.s16 q8, q12, q14 - vsub.s16 q3, q6, q3 - vsub.s16 q12, q12, q14 - vsub.s16 q3, q3, q1 - vsub.s16 q1, q9, q1 - vadd.s16 q2, q3, q2 - vsub.s16 q15, q8, q6 - vadd.s16 q1, q1, q2 - vadd.s16 q8, q8, q6 - vadd.s16 q14, q5, q3 - vsub.s16 q9, q5, q3 - vsub.s16 q13, q10, q2 - vpop {d8-d13} /* restore NEON registers */ - vadd.s16 q10, q10, q2 - vsub.s16 q11, q12, q1 - vadd.s16 q12, q12, q1 - /* Descale to 8-bit and range limit */ - vmov.u8 q0, #0x80 - vqshrn.s16 d16, q8, #5 - vqshrn.s16 d17, q9, #5 - vqshrn.s16 d18, q10, #5 - vqshrn.s16 d19, q11, #5 - vqshrn.s16 d20, q12, #5 - vqshrn.s16 d21, q13, #5 - vqshrn.s16 d22, q14, #5 - vqshrn.s16 d23, q15, #5 - vadd.u8 q8, q8, q0 - vadd.u8 q9, q9, q0 - vadd.u8 q10, q10, q0 - vadd.u8 q11, q11, q0 - /* Transpose the final 8-bit samples */ - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.8 d16, d17 - vtrn.8 d18, d19 - /* Store results to the output buffer */ - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d16}, [TMP1] - vst1.8 {d17}, [TMP2] - ldmia OUTPUT_BUF!, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - vst1.8 {d18}, [TMP1] - vtrn.8 d20, d21 - vst1.8 {d19}, [TMP2] - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - vst1.8 {d20}, [TMP1] - vtrn.8 d22, d23 - vst1.8 {d21}, [TMP2] - vst1.8 {d22}, [TMP3] - vst1.8 {d23}, [TMP4] - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - -/*****************************************************************************/ - -/* - * jsimd_idct_4x4_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - * - * TODO: a bit better instructions scheduling can be achieved by expanding - * idct_helper/transpose_4x4 macros and reordering instructions, - * but readability will suffer somewhat. 
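- *
- * (The .if \shift > 16 branch in idct_helper below exists because VRSHRN
- * accepts a narrowing shift of at most #16; the second pass descales by
- * 19 bits, so it must round-shift in 32 bits first and narrow with
- * VMOVN.)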
- */ - -#define CONST_BITS 13 - -#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ -#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ -#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ -#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ -#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ -#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ -#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ -#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ -#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ -#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ -#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ -#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ -#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ -#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ - -.balign 16 -jsimd_idct_4x4_neon_consts: - .short FIX_1_847759065 /* d0[0] */ - .short -FIX_0_765366865 /* d0[1] */ - .short -FIX_0_211164243 /* d0[2] */ - .short FIX_1_451774981 /* d0[3] */ - .short -FIX_2_172734803 /* d1[0] */ - .short FIX_1_061594337 /* d1[1] */ - .short -FIX_0_509795579 /* d1[2] */ - .short -FIX_0_601344887 /* d1[3] */ - .short FIX_0_899976223 /* d2[0] */ - .short FIX_2_562915447 /* d2[1] */ - .short 1 << (CONST_BITS + 1) /* d2[2] */ - .short 0 /* d2[3] */ - -.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 - vmull.s16 q14, \x4, d2[2] - vmlal.s16 q14, \x8, d0[0] - vmlal.s16 q14, \x14, d0[1] - - vmull.s16 q13, \x16, d1[2] - vmlal.s16 q13, \x12, d1[3] - vmlal.s16 q13, \x10, d2[0] - vmlal.s16 q13, \x6, d2[1] - - vmull.s16 q15, \x4, d2[2] - vmlsl.s16 q15, \x8, d0[0] - vmlsl.s16 q15, \x14, d0[1] - - vmull.s16 q12, \x16, d0[2] - vmlal.s16 q12, \x12, d0[3] - vmlal.s16 q12, \x10, d1[0] - vmlal.s16 q12, \x6, d1[1] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y29, q14 - .else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y29, q14, #\shift - .endif - - vadd.s32 q10, q15, q12 - vsub.s32 q15, q15, q12 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q15, q15, #\shift - vmovn.s32 \y27, q10 - vmovn.s32 \y28, q15 - .else - vrshrn.s32 \y27, q10, #\shift - vrshrn.s32 \y28, q15, #\shift - .endif -.endm - -asm_function jsimd_idct_4x4_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req r1 - TMP3 .req r2 - TMP4 .req ip - - vpush {d8-d15} - - /* Load constants (d3 is just used for padding) */ - adr TMP4, jsimd_idct_4x4_neon_consts - vld1.16 {d0, d1, d2, d3}, [TMP4, :128] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | d8 | d9 - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | d14 | d15 - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! - vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]! - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q3, q3, q10 - vmul.s16 q4, q4, q11 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - vmul.s16 q6, q6, q13 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! 
- vmul.s16 q7, q7, q14 - vmul.s16 q8, q8, q15 - - /* Pass 1 */ - idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11 - transpose_4x4 d5, d7, d9, d11 - - /* Pass 2 */ - idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29 - transpose_4x4 d26, d27, d28, d29 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vadd.s16 q14, q14, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q14 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - -#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT - /* We can use much less instructions on little endian systems if the - * OS kernel is not configured to trap unaligned memory accesses - */ - vst1.32 {d26[0]}, [TMP1]! - vst1.32 {d27[0]}, [TMP3]! - vst1.32 {d26[1]}, [TMP2]! - vst1.32 {d27[1]}, [TMP4]! -#else - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[0]}, [TMP3]! - vst1.8 {d26[1]}, [TMP1]! - vst1.8 {d27[1]}, [TMP3]! - vst1.8 {d26[2]}, [TMP1]! - vst1.8 {d27[2]}, [TMP3]! - vst1.8 {d26[3]}, [TMP1]! - vst1.8 {d27[3]}, [TMP3]! - - vst1.8 {d26[4]}, [TMP2]! - vst1.8 {d27[4]}, [TMP4]! - vst1.8 {d26[5]}, [TMP2]! - vst1.8 {d27[5]}, [TMP4]! - vst1.8 {d26[6]}, [TMP2]! - vst1.8 {d27[6]}, [TMP4]! - vst1.8 {d26[7]}, [TMP2]! - vst1.8 {d27[7]}, [TMP4]! -#endif - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.balign 8 -jsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* d0[0] */ - .short FIX_0_850430095 /* d0[1] */ - .short -FIX_1_272758580 /* d0[2] */ - .short FIX_3_624509785 /* d0[3] */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - vshll.s16 q14, \x4, #15 - vmull.s16 q13, \x6, d0[3] - vmlal.s16 q13, \x10, d0[2] - vmlal.s16 q13, \x12, d0[1] - vmlal.s16 q13, \x16, d0[0] - - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - - .if \shift > 16 - vrshr.s32 q10, q10, #\shift - vrshr.s32 q14, q14, #\shift - vmovn.s32 \y26, q10 - vmovn.s32 \y27, q14 - .else - vrshrn.s32 \y26, q10, #\shift - vrshrn.s32 \y27, q14, #\shift - .endif -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req r0 - COEF_BLOCK .req r1 - OUTPUT_BUF .req r2 - OUTPUT_COL .req r3 - TMP1 .req r0 - TMP2 .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP2, jsimd_idct_2x2_neon_consts - vld1.16 {d0}, [TMP2, :64] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d4 | d5 - * 1 | d6 | d7 - * 2 | - | - - * 3 | d10 | d11 - * 4 | - | - - * 5 | d12 | d13 - * 6 | - | - - * 7 | d16 | d17 - */ - vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]! 
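-    /* (Rows 2, 4 and 6 are never loaded: the 2x2 reduced IDCT uses only
-     * DCT rows 0, 1, 3, 5 and 7, per the allocation table above.) */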
- add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d10, d11}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d12, d13}, [COEF_BLOCK, :128]! - add COEF_BLOCK, COEF_BLOCK, #16 - vld1.16 {d16, d17}, [COEF_BLOCK, :128]! - /* Dequantize */ - vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]! - vmul.s16 q2, q2, q9 - vmul.s16 q3, q3, q10 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d24, d25}, [DCT_TABLE, :128]! - vmul.s16 q5, q5, q12 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d26, d27}, [DCT_TABLE, :128]! - vmul.s16 q6, q6, q13 - add DCT_TABLE, DCT_TABLE, #16 - vld1.16 {d30, d31}, [DCT_TABLE, :128]! - vmul.s16 q8, q8, q15 - - /* Pass 1 */ -#if 0 - idct_helper d4, d6, d10, d12, d16, 13, d4, d6 - transpose_4x4 d4, d6, d8, d10 - idct_helper d5, d7, d11, d13, d17, 13, d5, d7 - transpose_4x4 d5, d7, d9, d11 -#else - vmull.s16 q13, d6, d0[3] - vmlal.s16 q13, d10, d0[2] - vmlal.s16 q13, d12, d0[1] - vmlal.s16 q13, d16, d0[0] - vmull.s16 q12, d7, d0[3] - vmlal.s16 q12, d11, d0[2] - vmlal.s16 q12, d13, d0[1] - vmlal.s16 q12, d17, d0[0] - vshll.s16 q14, d4, #15 - vshll.s16 q15, d5, #15 - vadd.s32 q10, q14, q13 - vsub.s32 q14, q14, q13 - vrshrn.s32 d4, q10, #13 - vrshrn.s32 d6, q14, #13 - vadd.s32 q10, q15, q12 - vsub.s32 q14, q15, q12 - vrshrn.s32 d5, q10, #13 - vrshrn.s32 d7, q14, #13 - vtrn.16 q2, q3 - vtrn.32 q3, q5 -#endif - - /* Pass 2 */ - idct_helper d4, d6, d10, d7, d11, 20, d26, d27 - - /* Range limit */ - vmov.u16 q15, #0x80 - vadd.s16 q13, q13, q15 - vqmovun.s16 d26, q13 - vqmovun.s16 d27, q13 - - /* Store results to the output buffer */ - ldmia OUTPUT_BUF, {TMP1, TMP2} - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - vst1.8 {d26[0]}, [TMP1]! - vst1.8 {d27[4]}, [TMP1]! - vst1.8 {d26[1]}, [TMP2]! - vst1.8 {d27[5]}, [TMP2]! - - vpop {d8-d15} - bx lr - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_ycc_extrgb_convert_neon - * jsimd_ycc_extbgr_convert_neon - * jsimd_ycc_extrgbx_convert_neon - * jsimd_ycc_extbgrx_convert_neon - * jsimd_ycc_extxbgr_convert_neon - * jsimd_ycc_extxrgb_convert_neon - * - * Colorspace conversion YCbCr -> RGB - */ - - -.macro do_load size - .if \size == 8 - vld1.8 {d4}, [U, :64]! - vld1.8 {d5}, [V, :64]! - vld1.8 {d0}, [Y, :64]! - pld [U, #64] - pld [V, #64] - pld [Y, #64] - .elseif \size == 4 - vld1.8 {d4[0]}, [U]! - vld1.8 {d4[1]}, [U]! - vld1.8 {d4[2]}, [U]! - vld1.8 {d4[3]}, [U]! - vld1.8 {d5[0]}, [V]! - vld1.8 {d5[1]}, [V]! - vld1.8 {d5[2]}, [V]! - vld1.8 {d5[3]}, [V]! - vld1.8 {d0[0]}, [Y]! - vld1.8 {d0[1]}, [Y]! - vld1.8 {d0[2]}, [Y]! - vld1.8 {d0[3]}, [Y]! - .elseif \size == 2 - vld1.8 {d4[4]}, [U]! - vld1.8 {d4[5]}, [U]! - vld1.8 {d5[4]}, [V]! - vld1.8 {d5[5]}, [V]! - vld1.8 {d0[4]}, [Y]! - vld1.8 {d0[5]}, [Y]! - .elseif \size == 1 - vld1.8 {d4[6]}, [U]! - vld1.8 {d5[6]}, [V]! - vld1.8 {d0[6]}, [Y]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_store bpp, size - .if \bpp == 24 - .if \size == 8 - vst3.8 {d10, d11, d12}, [RGB]! - .elseif \size == 4 - vst3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vst3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vst3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vst3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vst3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vst3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vst3.8 {d10[6], d11[6], d12[6]}, [RGB]! 
- .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vst4.8 {d10, d11, d12, d13}, [RGB]! - .elseif \size == 4 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 16 - .if \size == 8 - vst1.16 {q15}, [RGB]! - .elseif \size == 4 - vst1.16 {d30}, [RGB]! - .elseif \size == 2 - vst1.16 {d31[0]}, [RGB]! - vst1.16 {d31[1]}, [RGB]! - .elseif \size == 1 - vst1.16 {d31[2]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2-stage pipelined YCbCr->RGB conversion - */ - -.macro do_yuv_to_rgb_stage1 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb_stage2 - vrshrn.s32 d20, q10, #15 - vrshrn.s32 d21, q11, #15 - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vrshrn.s32 d28, q14, #14 - vrshrn.s32 d29, q15, #14 - vaddw.u8 q11, q10, d0 - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - .if \bpp != 16 - vqmovun.s16 d1\g_offs, q11 - vqmovun.s16 d1\r_offs, q12 - vqmovun.s16 d1\b_offs, q14 - .else /* rgb565 */ - vqshlu.s16 q13, q11, #8 - vqshlu.s16 q15, q12, #8 - vqshlu.s16 q14, q14, #8 - vsri.u16 q15, q13, #5 - vsri.u16 q15, q14, #11 - .endif -.endm - -.macro do_yuv_to_rgb_stage2_store_load_stage1 - /* "do_yuv_to_rgb_stage2" and "store" */ - vrshrn.s32 d20, q10, #15 - /* "load" and "do_yuv_to_rgb_stage1" */ - pld [U, #64] - vrshrn.s32 d21, q11, #15 - pld [V, #64] - vrshrn.s32 d24, q12, #14 - vrshrn.s32 d25, q13, #14 - vld1.8 {d4}, [U, :64]! - vrshrn.s32 d28, q14, #14 - vld1.8 {d5}, [V, :64]! - vrshrn.s32 d29, q15, #14 - vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ - vaddw.u8 q4, q1, d5 /* q2 = v - 128 */ - vaddw.u8 q11, q10, d0 - vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ - vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ - vaddw.u8 q12, q12, d0 - vaddw.u8 q14, q14, d0 - .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ - vqmovun.s16 d1\g_offs, q11 - pld [Y, #64] - vqmovun.s16 d1\r_offs, q12 - vld1.8 {d0}, [Y, :64]! - vqmovun.s16 d1\b_offs, q14 - vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ - vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ - do_store \bpp, 8 - vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ - vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ - vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ - vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ - .else /**************************** rgb565 ********************************/ - vqshlu.s16 q13, q11, #8 - pld [Y, #64] - vqshlu.s16 q15, q12, #8 - vqshlu.s16 q14, q14, #8 - vld1.8 {d0}, [Y, :64]! 
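
The multiply/accumulate and narrowing-shift pairs above implement the standard JPEG YCbCr->RGB equations in fixed point. A rough per-pixel scalar equivalent (cb and cr already carry the -128 bias added via q1, and vqmovun saturates the results to [0, 255]):

    int r = y + ((22971 * cr + (1 << 13)) >> 14);               /*  1.40200 * cr */
    int g = y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15); /* -0.34414 * cb - 0.71414 * cr */
    int b = y + ((29033 * cb + (1 << 13)) >> 14);               /*  1.77200 * cb */
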
- vmull.s16 q11, d7, d1[1] - vmlal.s16 q11, d9, d1[2] - vsri.u16 q15, q13, #5 - vmull.s16 q12, d8, d1[0] - vsri.u16 q15, q14, #11 - vmull.s16 q13, d9, d1[0] - vmull.s16 q14, d6, d1[3] - do_store \bpp, 8 - vmull.s16 q15, d7, d1[3] - .endif -.endm - -.macro do_yuv_to_rgb - do_yuv_to_rgb_stage1 - do_yuv_to_rgb_stage2 -.endm - -/* Apple gas crashes on adrl, work around that by using adr. - * But this requires a copy of these constants for each function. - */ - -.balign 16 -jsimd_ycc_\colorid\()_neon_consts: - .short 0, 0, 0, 0 - .short 22971, -11277, -23401, 29033 - .short -128, -128, -128, -128 - .short -128, -128, -128, -128 - -asm_function jsimd_ycc_\colorid\()_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - INPUT_ROW .req r2 - OUTPUT_BUF .req r3 - NUM_ROWS .req r4 - - INPUT_BUF0 .req r5 - INPUT_BUF1 .req r6 - INPUT_BUF2 .req INPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d1, d2, d3 (d0 is just used for padding) */ - adr ip, jsimd_ycc_\colorid\()_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr INPUT_BUF0, [INPUT_BUF] - ldr INPUT_BUF1, [INPUT_BUF, #4] - ldr INPUT_BUF2, [INPUT_BUF, #8] - .unreq INPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Initially set d10, d11, d12, d13 to 0xFF */ - vmov.u8 q5, #255 - vmov.u8 q6, #255 - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2] - ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2] - mov N, OUTPUT_WIDTH - ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2] - add INPUT_ROW, INPUT_ROW, #1 - ldr RGB, [OUTPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load 8 - do_yuv_to_rgb_stage1 - subs N, N, #8 - blt 2f -1: - do_yuv_to_rgb_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_yuv_to_rgb_stage2 - do_store \bpp, 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load 4 -3: - tst N, #2 - beq 4f - do_load 2 -4: - tst N, #1 - beq 5f - do_load 1 -5: - do_yuv_to_rgb - tst N, #4 - beq 6f - do_store \bpp, 4 -6: - tst N, #2 - beq 7f - do_store \bpp, 2 -7: - tst N, #1 - beq 8f - do_store \bpp, 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq INPUT_ROW - .unreq OUTPUT_BUF - .unreq NUM_ROWS - .unreq INPUT_BUF0 - .unreq INPUT_BUF1 - .unreq INPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_yuv_to_rgb -.purgem do_yuv_to_rgb_stage1 -.purgem do_yuv_to_rgb_stage2 -.purgem do_yuv_to_rgb_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 -generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * jsimd_extrgb_ycc_convert_neon - * jsimd_extbgr_ycc_convert_neon - * jsimd_extrgbx_ycc_convert_neon - * jsimd_extbgrx_ycc_convert_neon - * jsimd_extxbgr_ycc_convert_neon - * jsimd_extxrgb_ycc_convert_neon - * - * Colorspace conversion 
RGB -> YCbCr - */ - -.macro do_store size - .if \size == 8 - vst1.8 {d20}, [Y]! - vst1.8 {d21}, [U]! - vst1.8 {d22}, [V]! - .elseif \size == 4 - vst1.8 {d20[0]}, [Y]! - vst1.8 {d20[1]}, [Y]! - vst1.8 {d20[2]}, [Y]! - vst1.8 {d20[3]}, [Y]! - vst1.8 {d21[0]}, [U]! - vst1.8 {d21[1]}, [U]! - vst1.8 {d21[2]}, [U]! - vst1.8 {d21[3]}, [U]! - vst1.8 {d22[0]}, [V]! - vst1.8 {d22[1]}, [V]! - vst1.8 {d22[2]}, [V]! - vst1.8 {d22[3]}, [V]! - .elseif \size == 2 - vst1.8 {d20[4]}, [Y]! - vst1.8 {d20[5]}, [Y]! - vst1.8 {d21[4]}, [U]! - vst1.8 {d21[5]}, [U]! - vst1.8 {d22[4]}, [V]! - vst1.8 {d22[5]}, [V]! - .elseif \size == 1 - vst1.8 {d20[6]}, [Y]! - vst1.8 {d21[6]}, [U]! - vst1.8 {d22[6]}, [V]! - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_load bpp, size - .if \bpp == 24 - .if \size == 8 - vld3.8 {d10, d11, d12}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld3.8 {d10[0], d11[0], d12[0]}, [RGB]! - vld3.8 {d10[1], d11[1], d12[1]}, [RGB]! - vld3.8 {d10[2], d11[2], d12[2]}, [RGB]! - vld3.8 {d10[3], d11[3], d12[3]}, [RGB]! - .elseif \size == 2 - vld3.8 {d10[4], d11[4], d12[4]}, [RGB]! - vld3.8 {d10[5], d11[5], d12[5]}, [RGB]! - .elseif \size == 1 - vld3.8 {d10[6], d11[6], d12[6]}, [RGB]! - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - vld4.8 {d10, d11, d12, d13}, [RGB]! - pld [RGB, #128] - .elseif \size == 4 - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]! - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]! - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! - .elseif \size == 2 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! - .elseif \size == 1 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! 
- .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs - -/* - * 2-stage pipelined RGB->YCbCr conversion - */ - -.macro do_rgb_to_yuv_stage1 - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vrev64.32 q9, q1 - vrev64.32 q13, q1 - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.macro do_rgb_to_yuv_stage2 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vshrn.u32 d23, q13, #16 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - vmovn.u16 d20, q10 /* d20 = y */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovn.u16 d22, q12 /* d22 = v */ -.endm - -.macro do_rgb_to_yuv - do_rgb_to_yuv_stage1 - do_rgb_to_yuv_stage2 -.endm - -.macro do_rgb_to_yuv_stage2_store_load_stage1 - vrshrn.u32 d20, q7, #16 - vrshrn.u32 d21, q8, #16 - vshrn.u32 d22, q9, #16 - vrev64.32 q9, q1 - vshrn.u32 d23, q13, #16 - vrev64.32 q13, q1 - vshrn.u32 d24, q14, #16 - vshrn.u32 d25, q15, #16 - do_load \bpp, 8 - vmovn.u16 d20, q10 /* d20 = y */ - vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */ - vmovn.u16 d21, q11 /* d21 = u */ - vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */ - vmovn.u16 d22, q12 /* d22 = v */ - vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */ - vmull.u16 q7, d4, d0[0] - vmlal.u16 q7, d6, d0[1] - vmlal.u16 q7, d8, d0[2] - vst1.8 {d20}, [Y]! - vmull.u16 q8, d5, d0[0] - vmlal.u16 q8, d7, d0[1] - vmlal.u16 q8, d9, d0[2] - vmlsl.u16 q9, d4, d0[3] - vmlsl.u16 q9, d6, d1[0] - vmlal.u16 q9, d8, d1[1] - vst1.8 {d21}, [U]! - vmlsl.u16 q13, d5, d0[3] - vmlsl.u16 q13, d7, d1[0] - vmlal.u16 q13, d9, d1[1] - vrev64.32 q14, q1 - vrev64.32 q15, q1 - vmlal.u16 q14, d4, d1[1] - vmlsl.u16 q14, d6, d1[2] - vmlsl.u16 q14, d8, d1[3] - vst1.8 {d22}, [V]! 
- vmlal.u16 q15, d5, d1[1] - vmlsl.u16 q15, d7, d1[2] - vmlsl.u16 q15, d9, d1[3] -.endm - -.balign 16 -jsimd_\colorid\()_ycc_neon_consts: - .short 19595, 38470, 7471, 11059 - .short 21709, 32768, 27439, 5329 - .short 32767, 128, 32767, 128 - .short 32767, 128, 32767, 128 - -asm_function jsimd_\colorid\()_ycc_convert_neon - OUTPUT_WIDTH .req r0 - INPUT_BUF .req r1 - OUTPUT_BUF .req r2 - OUTPUT_ROW .req r3 - NUM_ROWS .req r4 - - OUTPUT_BUF0 .req r5 - OUTPUT_BUF1 .req r6 - OUTPUT_BUF2 .req OUTPUT_BUF - - RGB .req r7 - Y .req r8 - U .req r9 - V .req r10 - N .req ip - - /* Load constants to d0, d1, d2, d3 */ - adr ip, jsimd_\colorid\()_ycc_neon_consts - vld1.16 {d0, d1, d2, d3}, [ip, :128] - - /* Save ARM registers and handle input arguments */ - push {r4, r5, r6, r7, r8, r9, r10, lr} - ldr NUM_ROWS, [sp, #(4 * 8)] - ldr OUTPUT_BUF0, [OUTPUT_BUF] - ldr OUTPUT_BUF1, [OUTPUT_BUF, #4] - ldr OUTPUT_BUF2, [OUTPUT_BUF, #8] - .unreq OUTPUT_BUF - - /* Save NEON registers */ - vpush {d8-d15} - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - blt 9f -0: - ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2] - ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2] - mov N, OUTPUT_WIDTH - ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2] - add OUTPUT_ROW, OUTPUT_ROW, #1 - ldr RGB, [INPUT_BUF], #4 - - /* Inner loop over pixels */ - subs N, N, #8 - blt 3f - do_load \bpp, 8 - do_rgb_to_yuv_stage1 - subs N, N, #8 - blt 2f -1: - do_rgb_to_yuv_stage2_store_load_stage1 - subs N, N, #8 - bge 1b -2: - do_rgb_to_yuv_stage2 - do_store 8 - tst N, #7 - beq 8f -3: - tst N, #4 - beq 3f - do_load \bpp, 4 -3: - tst N, #2 - beq 4f - do_load \bpp, 2 -4: - tst N, #1 - beq 5f - do_load \bpp, 1 -5: - do_rgb_to_yuv - tst N, #4 - beq 6f - do_store 4 -6: - tst N, #2 - beq 7f - do_store 2 -7: - tst N, #1 - beq 8f - do_store 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - bgt 0b -9: - /* Restore all registers and return */ - vpop {d8-d15} - pop {r4, r5, r6, r7, r8, r9, r10, pc} - - .unreq OUTPUT_WIDTH - .unreq OUTPUT_ROW - .unreq INPUT_BUF - .unreq NUM_ROWS - .unreq OUTPUT_BUF0 - .unreq OUTPUT_BUF1 - .unreq OUTPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_rgb_to_yuv -.purgem do_rgb_to_yuv_stage1 -.purgem do_rgb_to_yuv_stage2 -.purgem do_rgb_to_yuv_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B */ -generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2 -generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0 -generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1 -generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req r0 - START_COL .req r1 - WORKSPACE .req r2 - TMP1 .req r3 - TMP2 .req r4 - TMP3 .req r5 - TMP4 .req ip - - push {r4, r5} - vmov.u8 d0, #128 - - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d16}, [TMP1] - vsubl.u8 q8, d16, d0 - vld1.8 {d18}, [TMP2] - vsubl.u8 q9, d18, d0 - vld1.8 {d20}, [TMP3] - vsubl.u8 q10, d20, d0 - vld1.8 {d22}, [TMP4] - ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} - vsubl.u8 
q11, d22, d0 - vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - vld1.8 {d24}, [TMP1] - vsubl.u8 q12, d24, d0 - vld1.8 {d26}, [TMP2] - vsubl.u8 q13, d26, d0 - vld1.8 {d28}, [TMP3] - vsubl.u8 q14, d28, d0 - vld1.8 {d30}, [TMP4] - vsubl.u8 q15, d30, d0 - vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! - vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! - pop {r4, r5} - bx lr - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - - -/*****************************************************************************/ - -/* - * jsimd_fdct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the forward DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' - * function from jfdctfst.c - * - * TODO: can be combined with 'jsimd_convsamp_neon' to get - * rid of a bunch of VLD1.16 instructions - */ - -#define XFIX_0_382683433 d0[0] -#define XFIX_0_541196100 d0[1] -#define XFIX_0_707106781 d0[2] -#define XFIX_1_306562965 d0[3] - -.balign 16 -jsimd_fdct_ifast_neon_consts: - .short (98 * 128) /* XFIX_0_382683433 */ - .short (139 * 128) /* XFIX_0_541196100 */ - .short (181 * 128) /* XFIX_0_707106781 */ - .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ - -asm_function jsimd_fdct_ifast_neon - - DATA .req r0 - TMP .req ip - - vpush {d8-d15} - - /* Load constants */ - adr TMP, jsimd_fdct_ifast_neon_consts - vld1.16 {d0}, [TMP, :64] - - /* Load all DATA into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 | q8 - * 1 | d18 | d19 | q9 - * 2 | d20 | d21 | q10 - * 3 | d22 | d23 | q11 - * 4 | d24 | d25 | q12 - * 5 | d26 | d27 | q13 - * 6 | d28 | d29 | q14 - * 7 | d30 | d31 | q15 - */ - - vld1.16 {d16, d17, d18, d19}, [DATA, :128]! - vld1.16 {d20, d21, d22, d23}, [DATA, :128]! - vld1.16 {d24, d25, d26, d27}, [DATA, :128]! - vld1.16 {d28, d29, d30, d31}, [DATA, :128] - sub DATA, DATA, #(128 - 32) - - mov TMP, #2 -1: - /* Transpose */ - vtrn.16 q12, q13 - vtrn.16 q10, q11 - vtrn.16 q8, q9 - vtrn.16 q14, q15 - vtrn.32 q9, q11 - vtrn.32 q13, q15 - vtrn.32 q8, q10 - vtrn.32 q12, q14 - vswp d30, d23 - vswp d24, d17 - vswp d26, d19 - /* 1-D FDCT */ - vadd.s16 q2, q11, q12 - vswp d28, d21 - vsub.s16 q12, q11, q12 - vsub.s16 q6, q10, q13 - vadd.s16 q10, q10, q13 - vsub.s16 q7, q9, q14 - vadd.s16 q9, q9, q14 - vsub.s16 q1, q8, q15 - vadd.s16 q8, q8, q15 - vsub.s16 q4, q9, q10 - vsub.s16 q5, q8, q2 - vadd.s16 q3, q9, q10 - vadd.s16 q4, q4, q5 - vadd.s16 q2, q8, q2 - vqdmulh.s16 q4, q4, XFIX_0_707106781 - vadd.s16 q11, q12, q6 - vadd.s16 q8, q2, q3 - vsub.s16 q12, q2, q3 - vadd.s16 q3, q6, q7 - vadd.s16 q7, q7, q1 - vqdmulh.s16 q3, q3, XFIX_0_707106781 - vsub.s16 q6, q11, q7 - vadd.s16 q10, q5, q4 - vqdmulh.s16 q6, q6, XFIX_0_382683433 - vsub.s16 q14, q5, q4 - vqdmulh.s16 q11, q11, XFIX_0_541196100 - vqdmulh.s16 q5, q7, XFIX_1_306562965 - vadd.s16 q4, q1, q3 - vsub.s16 q3, q1, q3 - vadd.s16 q7, q7, q6 - vadd.s16 q11, q11, q6 - vadd.s16 q7, q7, q5 - vadd.s16 q13, q3, q11 - vsub.s16 q11, q3, q11 - vadd.s16 q9, q4, q7 - vsub.s16 q15, q4, q7 - subs TMP, TMP, #1 - bne 1b - - /* store results */ - vst1.16 {d16, d17, d18, d19}, [DATA, :128]! - vst1.16 {d20, d21, d22, d23}, [DATA, :128]! 
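
jsimd_convsamp_neon above is the vectorized form of libjpeg's sample conversion; a scalar sketch using the library's types (CENTERJSAMPLE is 128):

    /* Convert one 8x8 block of unsigned samples to centered, signed DCT input. */
    for (int row = 0; row < 8; row++) {
      JSAMPROW elem = sample_data[row] + start_col;
      for (int col = 0; col < 8; col++)
        *workspace++ = (DCTELEM)*elem++ - CENTERJSAMPLE;
    }
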
- vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
- vst1.16 {d28, d29, d30, d31}, [DATA, :128]
-
- vpop {d8-d15}
- bx lr
-
- .unreq DATA
- .unreq TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- * DCTELEM *workspace);
- *
- * Note: the code uses 2-stage pipelining in order to improve instruction
- * scheduling and eliminate stalls (this provides ~15% better
- * performance for this function on both ARM Cortex-A8 and
- * ARM Cortex-A9 when compared to the non-pipelined variant).
- * The instructions which belong to the second stage use different
- * indentation for better readability.
- */
-asm_function jsimd_quantize_neon
-
- COEF_BLOCK .req r0
- DIVISORS .req r1
- WORKSPACE .req r2
-
- RECIPROCAL .req DIVISORS
- CORRECTION .req r3
- SHIFT .req ip
- LOOP_COUNT .req r4
-
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- vabs.s16 q12, q0
- add CORRECTION, DIVISORS, #(64 * 2)
- add SHIFT, DIVISORS, #(64 * 6)
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
-
- push {r4, r5}
- mov LOOP_COUNT, #3
-1:
- vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
- veor.u16 q14, q14, q2 /* restore sign */
- vabs.s16 q12, q0
- vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
- vabs.s16 q13, q1
- veor.u16 q15, q15, q3
- vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
- vadd.u16 q12, q12, q10 /* add correction */
- vadd.u16 q13, q13, q11
- vmull.u16 q10, d24, d16 /* multiply by reciprocal */
- vmull.u16 q11, d25, d17
- vmull.u16 q8, d26, d18
- vmull.u16 q9, d27, d19
- vsub.u16 q14, q14, q2
- vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
- vsub.u16 q15, q15, q3
- vshrn.u32 d20, q10, #16
- vshrn.u32 d21, q11, #16
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
- vshrn.u32 d22, q8, #16
- vshrn.u32 d23, q9, #16
- vneg.s16 q12, q12
- vneg.s16 q13, q13
- vshr.s16 q2, q0, #15 /* extract sign */
- vshr.s16 q3, q1, #15
- vshl.u16 q14, q10, q12 /* shift */
- vshl.u16 q15, q11, q13
- subs LOOP_COUNT, LOOP_COUNT, #1
- bne 1b
- pop {r4, r5}
-
- veor.u16 q14, q14, q2 /* restore sign */
- veor.u16 q15, q15, q3
- vsub.u16 q14, q14, q2
- vsub.u16 q15, q15, q3
- vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
- bx lr /* return */
-
- .unreq COEF_BLOCK
- .unreq DIVISORS
- .unreq WORKSPACE
- .unreq RECIPROCAL
- .unreq CORRECTION
- .unreq SHIFT
- .unreq LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
- * JDIMENSION downsampled_width,
- * JSAMPARRAY input_data,
- * JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- * this code, which could potentially be solved to gain up to tens of
- * percent in performance on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels.
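
The filter being applied is libjpeg's "fancy" (triangular) h2v1 upsampling; per pair of output pixels it is, in scalar form (a sketch following jdsample.c, with the row edges handled separately):

    out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
    out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
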
- * The new 16 source pixels are loaded into q0. The previous 16 source
- * pixels are in q1. The shifted-by-one source pixels are constructed in q2
- * by using q0 and q1. Register d28 is used for multiplication by 3.
- * Register q15 is used for adding the +1 bias.
- */
-.macro upsample16 OUTPTR, INPTR
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- vmov q1, q0 /* back up source pixels to q1 */
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to the
- * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
- * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
- * instruction is not needed. This unrolling also allows loads and stores to
- * be reordered to compensate for multiplication latency and reduce stalls.
- */
-.macro upsample32 OUTPTR, INPTR
- /* even 16 pixels group */
- vld1.8 {q0}, [\INPTR]!
- vmovl.u8 q8, d0
- vext.8 q2, q1, q0, #15
- vmovl.u8 q9, d1
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d0, d28
- vmlal.u8 q11, d1, d28
- /* odd 16 pixels group */
- vld1.8 {q1}, [\INPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vmovl.u8 q8, d2
- vext.8 q2, q0, q1, #15
- vmovl.u8 q9, d3
- vaddw.u8 q10, q15, d4
- vaddw.u8 q11, q15, d5
- vmlal.u8 q8, d4, d28
- vmlal.u8 q9, d5, d28
- vmlal.u8 q10, d2, d28
- vmlal.u8 q11, d3, d28
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
- vrshrn.u16 d6, q8, #2
- vrshrn.u16 d7, q9, #2
- vshrn.u16 d8, q10, #2
- vshrn.u16 d9, q11, #2
- vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
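
The first and last output pixels of a row have only one neighbor, so (as in jdsample.c) the macro below simply replicates them before entering the vector loop; in scalar terms:

    out[0] = in[0];
    out[2 * width - 1] = in[width - 1];
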
- */ -.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 - /* special case for the first and last pixels */ - sub \WIDTH, \WIDTH, #1 - add \OUTPTR, \OUTPTR, #1 - ldrb \TMP1, [\INPTR, \WIDTH] - strb \TMP1, [\OUTPTR, \WIDTH, asl #1] - ldrb \TMP1, [\INPTR], #1 - strb \TMP1, [\OUTPTR, #-1] - vmov.8 d3[7], \TMP1 - - subs \WIDTH, \WIDTH, #32 - blt 5f -0: /* process 32 pixels per iteration */ - upsample32 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #32 - bge 0b -5: - adds \WIDTH, \WIDTH, #16 - blt 1f -0: /* process 16 pixels if needed */ - upsample16 \OUTPTR, \INPTR - subs \WIDTH, \WIDTH, #16 -1: - adds \WIDTH, \WIDTH, #16 - beq 9f - - /* load the remaining 1-15 pixels */ - add \INPTR, \INPTR, \WIDTH - tst \WIDTH, #1 - beq 2f - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #2 - beq 2f - vext.8 d0, d0, d0, #6 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #4 - beq 2f - vrev64.32 d0, d0 - sub \INPTR, \INPTR, #1 - vld1.8 {d0[3]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[2]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[1]}, [\INPTR] - sub \INPTR, \INPTR, #1 - vld1.8 {d0[0]}, [\INPTR] -2: - tst \WIDTH, #8 - beq 2f - vmov d1, d0 - sub \INPTR, \INPTR, #8 - vld1.8 {d0}, [\INPTR] -2: /* upsample the remaining pixels */ - vmovl.u8 q8, d0 - vext.8 q2, q1, q0, #15 - vmovl.u8 q9, d1 - vaddw.u8 q10, q15, d4 - vaddw.u8 q11, q15, d5 - vmlal.u8 q8, d4, d28 - vmlal.u8 q9, d5, d28 - vmlal.u8 q10, d0, d28 - vmlal.u8 q11, d1, d28 - vrshrn.u16 d10, q8, #2 - vrshrn.u16 d12, q9, #2 - vshrn.u16 d11, q10, #2 - vshrn.u16 d13, q11, #2 - vzip.8 d10, d11 - vzip.8 d12, d13 - /* store the remaining pixels */ - tst \WIDTH, #8 - beq 2f - vst1.8 {d10, d11}, [\OUTPTR]! - vmov q5, q6 -2: - tst \WIDTH, #4 - beq 2f - vst1.8 {d10}, [\OUTPTR]! - vmov d10, d11 -2: - tst \WIDTH, #2 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! - vst1.8 {d10[2]}, [\OUTPTR]! - vst1.8 {d10[3]}, [\OUTPTR]! - vext.8 d10, d10, d10, #4 -2: - tst \WIDTH, #1 - beq 2f - vst1.8 {d10[0]}, [\OUTPTR]! - vst1.8 {d10[1]}, [\OUTPTR]! -2: -9: -.endm - -asm_function jsimd_h2v1_fancy_upsample_neon - - MAX_V_SAMP_FACTOR .req r0 - DOWNSAMPLED_WIDTH .req r1 - INPUT_DATA .req r2 - OUTPUT_DATA_PTR .req r3 - OUTPUT_DATA .req OUTPUT_DATA_PTR - - OUTPTR .req r4 - INPTR .req r5 - WIDTH .req ip - TMP .req lr - - push {r4, r5, r6, lr} - vpush {d8-d15} - - ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] - cmp MAX_V_SAMP_FACTOR, #0 - ble 99f - - /* initialize constants */ - vmov.u8 d28, #3 - vmov.u16 q15, #1 -11: - ldr INPTR, [INPUT_DATA], #4 - ldr OUTPTR, [OUTPUT_DATA], #4 - mov WIDTH, DOWNSAMPLED_WIDTH - upsample_row OUTPTR, INPTR, WIDTH, TMP - subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 - bgt 11b - -99: - vpop {d8-d15} - pop {r4, r5, r6, pc} - - .unreq MAX_V_SAMP_FACTOR - .unreq DOWNSAMPLED_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA_PTR - .unreq OUTPUT_DATA - - .unreq OUTPTR - .unreq INPTR - .unreq WIDTH - .unreq TMP - -.purgem upsample16 -.purgem upsample32 -.purgem upsample_row - - -/*****************************************************************************/ - -/* - * GLOBAL(JOCTET *) - * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, - * JCOEFPTR block, int last_dc_val, - * c_derived_tbl *dctbl, c_derived_tbl *actbl) - * - */ - -.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP - sub \PUT_BITS, \PUT_BITS, #0x8 - lsr \TMP, \PUT_BUFFER, \PUT_BITS - uxtb \TMP, \TMP - strb \TMP, [\BUFFER, #1]! 
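
The emit_byte, put_bits, and checkbuf15 helpers being defined here keep the Huffman bitstream in a register-sized accumulator, flushing completed bytes and inserting the zero byte that JPEG requires after a literal 0xFF. A rough scalar sketch of that bookkeeping (simplified to flush one byte at a time; names are illustrative):

    put_buffer = (put_buffer << size) | code;  /* put_bits: append `size` bits */
    put_bits += size;
    while (put_bits >= 8) {                    /* emit_byte: flush a full byte */
      put_bits -= 8;
      unsigned char c = (unsigned char)(put_buffer >> put_bits);
      *++buffer = c;
      if (c == 0xFF)
        *++buffer = 0;                         /* stuff 0x00 after a 0xFF byte */
    }
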
- cmp \TMP, #0xff - /*it eq*/ - strbeq \ZERO, [\BUFFER, #1]! -.endm - -.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE - /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ - add \PUT_BITS, \SIZE - /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ - orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE -.endm - -.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP - cmp \PUT_BITS, #0x10 - blt 15f - eor \ZERO, \ZERO, \ZERO - emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP - emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP -15: -.endm - -.balign 16 -jsimd_huff_encode_one_block_neon_consts: - .byte 0x01 - .byte 0x02 - .byte 0x04 - .byte 0x08 - .byte 0x10 - .byte 0x20 - .byte 0x40 - .byte 0x80 - -asm_function jsimd_huff_encode_one_block_neon - push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - add r7, sp, #0x1c - sub r4, sp, #0x40 - bfc r4, #0, #5 - mov sp, r4 /* align sp on 32 bytes */ - vst1.64 {d8, d9, d10, d11}, [r4, :128]! - vst1.64 {d12, d13, d14, d15}, [r4, :128] - sub sp, #0x140 /* reserve 320 bytes */ - str r0, [sp, #0x18] /* working state > sp + Ox18 */ - add r4, sp, #0x20 /* r4 = t1 */ - ldr lr, [r7, #0x8] /* lr = dctbl */ - sub r10, r1, #0x1 /* r10=buffer-- */ - ldrsh r1, [r2] - mov r9, #0x10 - mov r8, #0x1 - adr r5, jsimd_huff_encode_one_block_neon_consts - /* prepare data */ - vld1.8 {d26}, [r5, :64] - veor q8, q8, q8 - veor q9, q9, q9 - vdup.16 q14, r9 - vdup.16 q15, r8 - veor q10, q10, q10 - veor q11, q11, q11 - sub r1, r1, r3 - add r9, r2, #0x22 - add r8, r2, #0x18 - add r3, r2, #0x36 - vmov.16 d0[0], r1 - vld1.16 {d2[0]}, [r9, :16] - vld1.16 {d4[0]}, [r8, :16] - vld1.16 {d6[0]}, [r3, :16] - add r1, r2, #0x2 - add r9, r2, #0x30 - add r8, r2, #0x26 - add r3, r2, #0x28 - vld1.16 {d0[1]}, [r1, :16] - vld1.16 {d2[1]}, [r9, :16] - vld1.16 {d4[1]}, [r8, :16] - vld1.16 {d6[1]}, [r3, :16] - add r1, r2, #0x10 - add r9, r2, #0x40 - add r8, r2, #0x34 - add r3, r2, #0x1a - vld1.16 {d0[2]}, [r1, :16] - vld1.16 {d2[2]}, [r9, :16] - vld1.16 {d4[2]}, [r8, :16] - vld1.16 {d6[2]}, [r3, :16] - add r1, r2, #0x20 - add r9, r2, #0x32 - add r8, r2, #0x42 - add r3, r2, #0xc - vld1.16 {d0[3]}, [r1, :16] - vld1.16 {d2[3]}, [r9, :16] - vld1.16 {d4[3]}, [r8, :16] - vld1.16 {d6[3]}, [r3, :16] - add r1, r2, #0x12 - add r9, r2, #0x24 - add r8, r2, #0x50 - add r3, r2, #0xe - vld1.16 {d1[0]}, [r1, :16] - vld1.16 {d3[0]}, [r9, :16] - vld1.16 {d5[0]}, [r8, :16] - vld1.16 {d7[0]}, [r3, :16] - add r1, r2, #0x4 - add r9, r2, #0x16 - add r8, r2, #0x60 - add r3, r2, #0x1c - vld1.16 {d1[1]}, [r1, :16] - vld1.16 {d3[1]}, [r9, :16] - vld1.16 {d5[1]}, [r8, :16] - vld1.16 {d7[1]}, [r3, :16] - add r1, r2, #0x6 - add r9, r2, #0x8 - add r8, r2, #0x52 - add r3, r2, #0x2a - vld1.16 {d1[2]}, [r1, :16] - vld1.16 {d3[2]}, [r9, :16] - vld1.16 {d5[2]}, [r8, :16] - vld1.16 {d7[2]}, [r3, :16] - add r1, r2, #0x14 - add r9, r2, #0xa - add r8, r2, #0x44 - add r3, r2, #0x38 - vld1.16 {d1[3]}, [r1, :16] - vld1.16 {d3[3]}, [r9, :16] - vld1.16 {d5[3]}, [r8, :16] - vld1.16 {d7[3]}, [r3, :16] - vcgt.s16 q8, q8, q0 - vcgt.s16 q9, q9, q1 - vcgt.s16 q10, q10, q2 - vcgt.s16 q11, q11, q3 - vabs.s16 q0, q0 - vabs.s16 q1, q1 - vabs.s16 q2, q2 - vabs.s16 q3, q3 - veor q8, q8, q0 - veor q9, q9, q1 - veor q10, q10, q2 - veor q11, q11, q3 - add r9, r4, #0x20 - add r8, r4, #0x80 - add r3, r4, #0xa0 - vclz.i16 q0, q0 - vclz.i16 q1, q1 - vclz.i16 q2, q2 - vclz.i16 q3, q3 - vsub.i16 q0, q14, q0 - vsub.i16 q1, q14, q1 - vsub.i16 q2, q14, q2 - vsub.i16 q3, q14, q3 - vst1.16 {d0, d1, d2, d3}, [r4, :256] - vst1.16 {d4, d5, d6, d7}, [r9, :256] - vshl.s16 q0, q15, q0 - 
vshl.s16 q1, q15, q1 - vshl.s16 q2, q15, q2 - vshl.s16 q3, q15, q3 - vsub.i16 q0, q0, q15 - vsub.i16 q1, q1, q15 - vsub.i16 q2, q2, q15 - vsub.i16 q3, q3, q15 - vand q8, q8, q0 - vand q9, q9, q1 - vand q10, q10, q2 - vand q11, q11, q3 - vst1.16 {d16, d17, d18, d19}, [r8, :256] - vst1.16 {d20, d21, d22, d23}, [r3, :256] - add r1, r2, #0x46 - add r9, r2, #0x3a - add r8, r2, #0x74 - add r3, r2, #0x6a - vld1.16 {d8[0]}, [r1, :16] - vld1.16 {d10[0]}, [r9, :16] - vld1.16 {d12[0]}, [r8, :16] - vld1.16 {d14[0]}, [r3, :16] - veor q8, q8, q8 - veor q9, q9, q9 - veor q10, q10, q10 - veor q11, q11, q11 - add r1, r2, #0x54 - add r9, r2, #0x2c - add r8, r2, #0x76 - add r3, r2, #0x78 - vld1.16 {d8[1]}, [r1, :16] - vld1.16 {d10[1]}, [r9, :16] - vld1.16 {d12[1]}, [r8, :16] - vld1.16 {d14[1]}, [r3, :16] - add r1, r2, #0x62 - add r9, r2, #0x1e - add r8, r2, #0x68 - add r3, r2, #0x7a - vld1.16 {d8[2]}, [r1, :16] - vld1.16 {d10[2]}, [r9, :16] - vld1.16 {d12[2]}, [r8, :16] - vld1.16 {d14[2]}, [r3, :16] - add r1, r2, #0x70 - add r9, r2, #0x2e - add r8, r2, #0x5a - add r3, r2, #0x6c - vld1.16 {d8[3]}, [r1, :16] - vld1.16 {d10[3]}, [r9, :16] - vld1.16 {d12[3]}, [r8, :16] - vld1.16 {d14[3]}, [r3, :16] - add r1, r2, #0x72 - add r9, r2, #0x3c - add r8, r2, #0x4c - add r3, r2, #0x5e - vld1.16 {d9[0]}, [r1, :16] - vld1.16 {d11[0]}, [r9, :16] - vld1.16 {d13[0]}, [r8, :16] - vld1.16 {d15[0]}, [r3, :16] - add r1, r2, #0x64 - add r9, r2, #0x4a - add r8, r2, #0x3e - add r3, r2, #0x6e - vld1.16 {d9[1]}, [r1, :16] - vld1.16 {d11[1]}, [r9, :16] - vld1.16 {d13[1]}, [r8, :16] - vld1.16 {d15[1]}, [r3, :16] - add r1, r2, #0x56 - add r9, r2, #0x58 - add r8, r2, #0x4e - add r3, r2, #0x7c - vld1.16 {d9[2]}, [r1, :16] - vld1.16 {d11[2]}, [r9, :16] - vld1.16 {d13[2]}, [r8, :16] - vld1.16 {d15[2]}, [r3, :16] - add r1, r2, #0x48 - add r9, r2, #0x66 - add r8, r2, #0x5c - add r3, r2, #0x7e - vld1.16 {d9[3]}, [r1, :16] - vld1.16 {d11[3]}, [r9, :16] - vld1.16 {d13[3]}, [r8, :16] - vld1.16 {d15[3]}, [r3, :16] - vcgt.s16 q8, q8, q4 - vcgt.s16 q9, q9, q5 - vcgt.s16 q10, q10, q6 - vcgt.s16 q11, q11, q7 - vabs.s16 q4, q4 - vabs.s16 q5, q5 - vabs.s16 q6, q6 - vabs.s16 q7, q7 - veor q8, q8, q4 - veor q9, q9, q5 - veor q10, q10, q6 - veor q11, q11, q7 - add r1, r4, #0x40 - add r9, r4, #0x60 - add r8, r4, #0xc0 - add r3, r4, #0xe0 - vclz.i16 q4, q4 - vclz.i16 q5, q5 - vclz.i16 q6, q6 - vclz.i16 q7, q7 - vsub.i16 q4, q14, q4 - vsub.i16 q5, q14, q5 - vsub.i16 q6, q14, q6 - vsub.i16 q7, q14, q7 - vst1.16 {d8, d9, d10, d11}, [r1, :256] - vst1.16 {d12, d13, d14, d15}, [r9, :256] - vshl.s16 q4, q15, q4 - vshl.s16 q5, q15, q5 - vshl.s16 q6, q15, q6 - vshl.s16 q7, q15, q7 - vsub.i16 q4, q4, q15 - vsub.i16 q5, q5, q15 - vsub.i16 q6, q6, q15 - vsub.i16 q7, q7, q15 - vand q8, q8, q4 - vand q9, q9, q5 - vand q10, q10, q6 - vand q11, q11, q7 - vst1.16 {d16, d17, d18, d19}, [r8, :256] - vst1.16 {d20, d21, d22, d23}, [r3, :256] - ldr r12, [r7, #0xc] /* r12 = actbl */ - add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ - mov r9, r12 /* r9 = actbl */ - add r6, r4, #0x80 /* r6 = t2 */ - ldr r11, [r0, #0x8] /* r11 = put_buffer */ - ldr r4, [r0, #0xc] /* r4 = put_bits */ - ldrh r2, [r6, #-128] /* r2 = nbits */ - ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */ - ldr r0, [lr, r2, lsl #2] - ldrb r5, [r1, r2] - put_bits r11, r4, r0, r5 - checkbuf15 r10, r11, r4, r5, r0 - put_bits r11, r4, r3, r2 - checkbuf15 r10, r11, r4, r5, r0 - mov lr, r6 /* lr = t2 */ - add r5, r9, #0x400 /* r5 = actbl->ehufsi */ - ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ - veor q8, q8, 
q8 - vceq.i16 q0, q0, q8 - vceq.i16 q1, q1, q8 - vceq.i16 q2, q2, q8 - vceq.i16 q3, q3, q8 - vceq.i16 q4, q4, q8 - vceq.i16 q5, q5, q8 - vceq.i16 q6, q6, q8 - vceq.i16 q7, q7, q8 - vmovn.i16 d0, q0 - vmovn.i16 d2, q1 - vmovn.i16 d4, q2 - vmovn.i16 d6, q3 - vmovn.i16 d8, q4 - vmovn.i16 d10, q5 - vmovn.i16 d12, q6 - vmovn.i16 d14, q7 - vand d0, d0, d26 - vand d2, d2, d26 - vand d4, d4, d26 - vand d6, d6, d26 - vand d8, d8, d26 - vand d10, d10, d26 - vand d12, d12, d26 - vand d14, d14, d26 - vpadd.i8 d0, d0, d2 - vpadd.i8 d4, d4, d6 - vpadd.i8 d8, d8, d10 - vpadd.i8 d12, d12, d14 - vpadd.i8 d0, d0, d4 - vpadd.i8 d8, d8, d12 - vpadd.i8 d0, d0, d8 - vmov.32 r1, d0[1] - vmov.32 r8, d0[0] - mvn r1, r1 - mvn r8, r8 - lsrs r1, r1, #0x1 - rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ - rbit r1, r1 /* r1 = index1 */ - rbit r8, r8 /* r8 = index0 */ - ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ - str r1, [sp, #0x14] /* index1 > sp + 0x14 */ - cmp r8, #0x0 - beq 6f -1: - clz r2, r8 - add lr, lr, r2, lsl #1 - lsl r8, r8, r2 - ldrh r1, [lr, #-126] -2: - cmp r2, #0x10 - blt 3f - sub r2, r2, #0x10 - put_bits r11, r4, r0, r6 - cmp r4, #0x10 - blt 2b - eor r3, r3, r3 - emit_byte r10, r11, r4, r3, r12 - emit_byte r10, r11, r4, r3, r12 - b 2b -3: - add r2, r1, r2, lsl #4 - ldrh r3, [lr, #2]! - ldr r12, [r9, r2, lsl #2] - ldrb r2, [r5, r2] - put_bits r11, r4, r12, r2 - checkbuf15 r10, r11, r4, r2, r12 - put_bits r11, r4, r3, r1 - checkbuf15 r10, r11, r4, r2, r12 - lsls r8, r8, #0x1 - bne 1b -6: - add r12, sp, #0x20 /* r12 = t1 */ - ldr r8, [sp, #0x14] /* r8 = index1 */ - adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ - cmp r8, #0x0 - beq 6f - clz r2, r8 - sub r12, r12, lr - lsl r8, r8, r2 - add r2, r2, r12, lsr #1 - add lr, lr, r2, lsl #1 - b 7f -1: - clz r2, r8 - add lr, lr, r2, lsl #1 - lsl r8, r8, r2 -7: - ldrh r1, [lr, #-126] -2: - cmp r2, #0x10 - blt 3f - sub r2, r2, #0x10 - put_bits r11, r4, r0, r6 - cmp r4, #0x10 - blt 2b - eor r3, r3, r3 - emit_byte r10, r11, r4, r3, r12 - emit_byte r10, r11, r4, r3, r12 - b 2b -3: - add r2, r1, r2, lsl #4 - ldrh r3, [lr, #2]! - ldr r12, [r9, r2, lsl #2] - ldrb r2, [r5, r2] - put_bits r11, r4, r12, r2 - checkbuf15 r10, r11, r4, r2, r12 - put_bits r11, r4, r3, r1 - checkbuf15 r10, r11, r4, r2, r12 - lsls r8, r8, #0x1 - bne 1b -6: - add r0, sp, #0x20 - add r0, #0xfe - cmp lr, r0 - bhs 1f - ldr r1, [r9] - ldrb r0, [r5] - put_bits r11, r4, r1, r0 - checkbuf15 r10, r11, r4, r0, r1 -1: - ldr r12, [sp, #0x18] - str r11, [r12, #0x8] - str r4, [r12, #0xc] - add r0, r10, #0x1 - add r4, sp, #0x140 - vld1.64 {d8, d9, d10, d11}, [r4, :128]! - vld1.64 {d12, d13, d14, d15}, [r4, :128] - sub r4, r7, #0x1c - mov sp, r4 - pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - -.purgem emit_byte -.purgem put_bits -.purgem checkbuf15 diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S deleted file mode 100644 index d30715a..0000000 --- a/simd/arm64/jsimd_neon.S +++ /dev/null @@ -1,3432 +0,0 @@ -/* - * ARMv8 NEON optimizations for libjpeg-turbo - * - * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). - * All Rights Reserved. - * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> - * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. - * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> - * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. - * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. 
- * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ -#endif - -#if defined(__APPLE__) -.section __DATA,__const -#else -.section .rodata, "a", %progbits -#endif - -#define F_0_298 2446 /* FIX(0.298631336) */ -#define F_0_390 3196 /* FIX(0.390180644) */ -#define F_0_541 4433 /* FIX(0.541196100) */ -#define F_0_765 6270 /* FIX(0.765366865) */ -#define F_0_899 7373 /* FIX(0.899976223) */ -#define F_1_175 9633 /* FIX(1.175875602) */ -#define F_1_501 12299 /* FIX(1.501321110) */ -#define F_1_847 15137 /* FIX(1.847759065) */ -#define F_1_961 16069 /* FIX(1.961570560) */ -#define F_2_053 16819 /* FIX(2.053119869) */ -#define F_2_562 20995 /* FIX(2.562915447) */ -#define F_3_072 25172 /* FIX(3.072711026) */ - -.balign 16 -Ljsimd_idct_islow_neon_consts: - .short F_0_298 - .short -F_0_390 - .short F_0_541 - .short F_0_765 - .short - F_0_899 - .short F_1_175 - .short F_1_501 - .short - F_1_847 - .short - F_1_961 - .short F_2_053 - .short - F_2_562 - .short F_3_072 - .short 0 /* padding */ - .short 0 - .short 0 - .short 0 - -#undef F_0_298 -#undef F_0_390 -#undef F_0_541 -#undef F_0_765 -#undef F_0_899 -#undef F_1_175 -#undef F_1_501 -#undef F_1_847 -#undef F_1_961 -#undef F_2_053 -#undef F_2_562 -#undef F_3_072 - - -#define XFIX_1_082392200 v0.h[0] -#define XFIX_1_414213562 v0.h[1] -#define XFIX_1_847759065 v0.h[2] -#define XFIX_2_613125930 v0.h[3] - -.balign 16 -Ljsimd_idct_ifast_neon_consts: - .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ - .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ - .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ - .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ - -#define CONST_BITS 13 -#define PASS1_BITS 2 - -#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ -#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ -#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ -#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ -#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ -#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ -#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ -#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ -#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ -#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ -#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ -#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ -#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ -#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ - -.balign 16 -Ljsimd_idct_4x4_neon_consts: - .short FIX_1_847759065 /* v0.h[0] */ - .short -FIX_0_765366865 /* v0.h[1] */ - .short -FIX_0_211164243 /* 
v0.h[2] */ - .short FIX_1_451774981 /* v0.h[3] */ - .short -FIX_2_172734803 /* d1[0] */ - .short FIX_1_061594337 /* d1[1] */ - .short -FIX_0_509795579 /* d1[2] */ - .short -FIX_0_601344887 /* d1[3] */ - .short FIX_0_899976223 /* v2.h[0] */ - .short FIX_2_562915447 /* v2.h[1] */ - .short 1 << (CONST_BITS + 1) /* v2.h[2] */ - .short 0 /* v2.h[3] */ - -.balign 8 -Ljsimd_idct_2x2_neon_consts: - .short -FIX_0_720959822 /* v14[0] */ - .short FIX_0_850430095 /* v14[1] */ - .short -FIX_1_272758580 /* v14[2] */ - .short FIX_3_624509785 /* v14[3] */ - -.balign 16 -Ljsimd_ycc_colorid_neon_consts: - .short 0, 0, 0, 0 - .short 22971, -11277, -23401, 29033 - .short -128, -128, -128, -128 - .short -128, -128, -128, -128 - -.balign 16 -Ljsimd_colorid_ycc_neon_consts: - .short 19595, 38470, 7471, 11059 - .short 21709, 32768, 27439, 5329 - .short 32767, 128, 32767, 128 - .short 32767, 128, 32767, 128 - -#define F_0_298 2446 /* FIX(0.298631336) */ -#define F_0_390 3196 /* FIX(0.390180644) */ -#define F_0_541 4433 /* FIX(0.541196100) */ -#define F_0_765 6270 /* FIX(0.765366865) */ -#define F_0_899 7373 /* FIX(0.899976223) */ -#define F_1_175 9633 /* FIX(1.175875602) */ -#define F_1_501 12299 /* FIX(1.501321110) */ -#define F_1_847 15137 /* FIX(1.847759065) */ -#define F_1_961 16069 /* FIX(1.961570560) */ -#define F_2_053 16819 /* FIX(2.053119869) */ -#define F_2_562 20995 /* FIX(2.562915447) */ -#define F_3_072 25172 /* FIX(3.072711026) */ - -.balign 16 -Ljsimd_fdct_islow_neon_consts: - .short F_0_298 - .short -F_0_390 - .short F_0_541 - .short F_0_765 - .short - F_0_899 - .short F_1_175 - .short F_1_501 - .short - F_1_847 - .short - F_1_961 - .short F_2_053 - .short - F_2_562 - .short F_3_072 - .short 0 /* padding */ - .short 0 - .short 0 - .short 0 - -#undef F_0_298 -#undef F_0_390 -#undef F_0_541 -#undef F_0_765 -#undef F_0_899 -#undef F_1_175 -#undef F_1_501 -#undef F_1_847 -#undef F_1_961 -#undef F_2_053 -#undef F_2_562 -#undef F_3_072 - -.balign 16 -Ljsimd_fdct_ifast_neon_consts: - .short (98 * 128) /* XFIX_0_382683433 */ - .short (139 * 128) /* XFIX_0_541196100 */ - .short (181 * 128) /* XFIX_0_707106781 */ - .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ - -.balign 16 -Ljsimd_h2_downsample_neon_consts: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ - 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ - 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ - 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 
*/ - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ - 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ - .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ - 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ - .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ - 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ - .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ - -Ljsimd_huff_encode_one_block_neon_consts: - .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 - .byte 0, 1, 2, 3, 16, 17, 32, 33, \ - 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ - .byte 34, 35, 48, 49, 255, 255, 50, 51, \ - 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ - .byte 8, 9, 22, 23, 36, 37, 50, 51, \ - 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ - .byte 54, 55, 40, 41, 26, 27, 12, 13, \ - 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ - .byte 6, 7, 20, 21, 34, 35, 48, 49, \ - 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ - .byte 42, 43, 28, 29, 14, 15, 30, 31, \ - 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ - .byte 255, 255, 255, 255, 56, 57, 42, 43, \ - 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ - .byte 26, 27, 40, 41, 42, 43, 28, 29, \ - 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ - .byte 255, 255, 255, 255, 0, 1, 255, 255, \ - 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ - .byte 255, 255, 255, 255, 255, 255, 255, 255, \ - 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ - .byte 255, 255, 255, 255, 255, 255, 255, 255, \ - 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ - .byte 4, 5, 6, 7, 255, 255, 255, 255, \ - 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ -Ljsimd_huff_encode_one_block_neon_slowtbl_consts: - .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 - -.text - - -#define RESPECT_STRICT_ALIGNMENT 1 - - -/*****************************************************************************/ - -/* Supplementary macro for setting function attributes */ -.macro asm_function fname -#ifdef __APPLE__ - .private_extern _\fname - .globl _\fname -_\fname: -#else - .global \fname -#ifdef __ELF__ - .hidden \fname - .type \fname, %function -#endif -\fname: -#endif -.endm - -.macro get_symbol_loc xi, symbol -#ifdef __APPLE__ - adrp \xi, \symbol@PAGE - add \xi, \xi, \symbol@PAGEOFF -#else - adrp \xi, \symbol - add \xi, \xi, :lo12:\symbol -#endif -.endm - -/* Transpose elements of single 128 bit registers */ -.macro transpose_single x0, x1, xi, xilen, literal - ins \xi\xilen[0], \x0\xilen[0] - ins \x1\xilen[0], \x0\xilen[1] - trn1 \x0\literal, \x0\literal, \x1\literal - trn2 \x1\literal, \xi\literal, \x1\literal -.endm - -/* Transpose elements of 2 different registers */ -.macro transpose x0, x1, xi, xilen, literal - mov \xi\xilen, \x0\xilen - trn1 \x0\literal, \x0\literal, \x1\literal - trn2 \x1\literal, \xi\literal, \x1\literal -.endm - -/* Transpose a block of 4x4 coefficients in four 64-bit registers */ -.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen - mov \xi\xilen, \x0\xilen - trn1 \x0\x0len, \x0\x0len, \x2\x2len - trn2 \x2\x2len, \xi\x0len, \x2\x2len - mov \xi\xilen, \x1\xilen - trn1 
\x1\x1len, \x1\x1len, \x3\x3len - trn2 \x3\x3len, \xi\x1len, \x3\x3len -.endm - -.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen - mov \xi\xilen, \x0\xilen - trn1 \x0\x0len, \x0\x0len, \x1\x1len - trn2 \x1\x2len, \xi\x0len, \x1\x2len - mov \xi\xilen, \x2\xilen - trn1 \x2\x2len, \x2\x2len, \x3\x3len - trn2 \x3\x2len, \xi\x1len, \x3\x3len -.endm - -.macro transpose_4x4 x0, x1, x2, x3, x5 - transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b - transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b -.endm - -.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3 - trn1 \t0\().8h, \l0\().8h, \l1\().8h - trn1 \t1\().8h, \l2\().8h, \l3\().8h - trn1 \t2\().8h, \l4\().8h, \l5\().8h - trn1 \t3\().8h, \l6\().8h, \l7\().8h - trn2 \l1\().8h, \l0\().8h, \l1\().8h - trn2 \l3\().8h, \l2\().8h, \l3\().8h - trn2 \l5\().8h, \l4\().8h, \l5\().8h - trn2 \l7\().8h, \l6\().8h, \l7\().8h - - trn1 \l4\().4s, \t2\().4s, \t3\().4s - trn2 \t3\().4s, \t2\().4s, \t3\().4s - trn1 \t2\().4s, \t0\().4s, \t1\().4s - trn2 \l2\().4s, \t0\().4s, \t1\().4s - trn1 \t0\().4s, \l1\().4s, \l3\().4s - trn2 \l3\().4s, \l1\().4s, \l3\().4s - trn2 \t1\().4s, \l5\().4s, \l7\().4s - trn1 \l5\().4s, \l5\().4s, \l7\().4s - - trn2 \l6\().2d, \l2\().2d, \t3\().2d - trn1 \l0\().2d, \t2\().2d, \l4\().2d - trn1 \l1\().2d, \t0\().2d, \l5\().2d - trn2 \l7\().2d, \l3\().2d, \t1\().2d - trn1 \l2\().2d, \l2\().2d, \t3\().2d - trn2 \l4\().2d, \t2\().2d, \l4\().2d - trn1 \l3\().2d, \l3\().2d, \t1\().2d - trn2 \l5\().2d, \t0\().2d, \l5\().2d -.endm - - -#define CENTERJSAMPLE 128 - -/*****************************************************************************/ - -/* - * Perform dequantization and inverse DCT on one block of coefficients. - * - * GLOBAL(void) - * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, - * JSAMPARRAY output_buf, JDIMENSION output_col) - */ - -#define CONST_BITS 13 -#define PASS1_BITS 2 - -#define XFIX_P_0_298 v0.h[0] -#define XFIX_N_0_390 v0.h[1] -#define XFIX_P_0_541 v0.h[2] -#define XFIX_P_0_765 v0.h[3] -#define XFIX_N_0_899 v0.h[4] -#define XFIX_P_1_175 v0.h[5] -#define XFIX_P_1_501 v0.h[6] -#define XFIX_N_1_847 v0.h[7] -#define XFIX_N_1_961 v1.h[0] -#define XFIX_P_2_053 v1.h[1] -#define XFIX_N_2_562 v1.h[2] -#define XFIX_P_3_072 v1.h[3] - -asm_function jsimd_idct_islow_neon - DCT_TABLE .req x0 - COEF_BLOCK .req x1 - OUTPUT_BUF .req x2 - OUTPUT_COL .req x3 - TMP1 .req x0 - TMP2 .req x1 - TMP3 .req x9 - TMP4 .req x10 - TMP5 .req x11 - TMP6 .req x12 - TMP7 .req x13 - TMP8 .req x14 - - /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x3 are valid. This - instruction ensures that those bits are set to zero. 
*/ - uxtw x3, w3 - - sub sp, sp, #64 - get_symbol_loc x15, Ljsimd_idct_islow_neon_consts - mov x10, sp - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 - ld1 {v0.8h, v1.8h}, [x15] - ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 - ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 - ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 - ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 - - cmeq v16.8h, v3.8h, #0 - cmeq v26.8h, v4.8h, #0 - cmeq v27.8h, v5.8h, #0 - cmeq v28.8h, v6.8h, #0 - cmeq v29.8h, v7.8h, #0 - cmeq v30.8h, v8.8h, #0 - cmeq v31.8h, v9.8h, #0 - - and v10.16b, v16.16b, v26.16b - and v11.16b, v27.16b, v28.16b - and v12.16b, v29.16b, v30.16b - and v13.16b, v31.16b, v10.16b - and v14.16b, v11.16b, v12.16b - mul v2.8h, v2.8h, v18.8h - and v15.16b, v13.16b, v14.16b - shl v10.8h, v2.8h, #(PASS1_BITS) - sqxtn v16.8b, v15.8h - mov TMP1, v16.d[0] - mvn TMP2, TMP1 - - cbnz TMP2, 2f - /* case all AC coeffs are zeros */ - dup v2.2d, v10.d[0] - dup v6.2d, v10.d[1] - mov v3.16b, v2.16b - mov v7.16b, v6.16b - mov v4.16b, v2.16b - mov v8.16b, v6.16b - mov v5.16b, v2.16b - mov v9.16b, v6.16b -1: - /* for this transpose, we should organise data like this: - * 00, 01, 02, 03, 40, 41, 42, 43 - * 10, 11, 12, 13, 50, 51, 52, 53 - * 20, 21, 22, 23, 60, 61, 62, 63 - * 30, 31, 32, 33, 70, 71, 72, 73 - * 04, 05, 06, 07, 44, 45, 46, 47 - * 14, 15, 16, 17, 54, 55, 56, 57 - * 24, 25, 26, 27, 64, 65, 66, 67 - * 34, 35, 36, 37, 74, 75, 76, 77 - */ - trn1 v28.8h, v2.8h, v3.8h - trn1 v29.8h, v4.8h, v5.8h - trn1 v30.8h, v6.8h, v7.8h - trn1 v31.8h, v8.8h, v9.8h - trn2 v16.8h, v2.8h, v3.8h - trn2 v17.8h, v4.8h, v5.8h - trn2 v18.8h, v6.8h, v7.8h - trn2 v19.8h, v8.8h, v9.8h - trn1 v2.4s, v28.4s, v29.4s - trn1 v6.4s, v30.4s, v31.4s - trn1 v3.4s, v16.4s, v17.4s - trn1 v7.4s, v18.4s, v19.4s - trn2 v4.4s, v28.4s, v29.4s - trn2 v8.4s, v30.4s, v31.4s - trn2 v5.4s, v16.4s, v17.4s - trn2 v9.4s, v18.4s, v19.4s - /* Even part: reverse the even part of the forward DCT. 
*/ - add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ - add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - mov v21.16b, v19.16b /* tmp3 = z1 */ - mov v20.16b, v18.16b /* tmp3 = z1 */ - smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ - sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ - add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ - sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ - add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ - sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ - add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ - sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
- */ - - add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ - - smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - add v23.4s, v23.4s, v27.4s /* z3 += z5 */ - add v22.4s, v22.4s, v26.4s /* z3 += z5 */ - add v25.4s, v25.4s, v27.4s /* z4 += z5 */ - add v24.4s, v24.4s, v26.4s /* z4 += z5 */ - - add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ - add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ - add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ - add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ - add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ - add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ - add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ - add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ - - add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ - add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ - add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ - add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ - add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ - add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ - add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ - add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ - add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ - sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ - sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ - add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ - add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ - sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ - sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ - add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ - 
add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ - sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ - sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ - add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ - add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ - sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ - sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ - - shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ - shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ - shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ - shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ - shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ - shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ - shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ - shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ - shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ - shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ - shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ - shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ - shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ - shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ - shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ - shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ - movi v0.16b, #(CENTERJSAMPLE) - /* Prepare pointers (dual-issue with NEON instructions) */ - ldp TMP1, TMP2, [OUTPUT_BUF], 16 - sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16) - ldp TMP3, TMP4, [OUTPUT_BUF], 16 - sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16) - add TMP1, TMP1, OUTPUT_COL - sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16) - add TMP2, TMP2, OUTPUT_COL - sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16) - add TMP3, TMP3, OUTPUT_COL - sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16) - add TMP4, TMP4, OUTPUT_COL - sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16) - ldp TMP5, TMP6, [OUTPUT_BUF], 16 - sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16) - ldp TMP7, TMP8, [OUTPUT_BUF], 16 - sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16) - add TMP5, TMP5, OUTPUT_COL - add v16.16b, v28.16b, v0.16b - add TMP6, TMP6, OUTPUT_COL - add v18.16b, v29.16b, v0.16b - add TMP7, TMP7, OUTPUT_COL - add v20.16b, v30.16b, v0.16b - add TMP8, TMP8, OUTPUT_COL - add v22.16b, v31.16b, v0.16b - - /* Transpose the final 8-bit samples */ - trn1 v28.16b, v16.16b, v18.16b - trn1 v30.16b, v20.16b, v22.16b - trn2 v29.16b, v16.16b, v18.16b - trn2 v31.16b, v20.16b, v22.16b - - trn1 v16.8h, v28.8h, v30.8h - trn2 v18.8h, v28.8h, v30.8h - trn1 v20.8h, v29.8h, v31.8h - trn2 v22.8h, v29.8h, v31.8h - - uzp1 v28.4s, v16.4s, v18.4s - uzp2 v30.4s, v16.4s, v18.4s - uzp1 v29.4s, v20.4s, v22.4s - uzp2 v31.4s, v20.4s, v22.4s - - /* Store results to the output buffer */ - st1 {v28.d}[0], [TMP1] - st1 {v29.d}[0], [TMP2] - st1 
{v28.d}[1], [TMP3] - st1 {v29.d}[1], [TMP4] - st1 {v30.d}[0], [TMP5] - st1 {v31.d}[0], [TMP6] - st1 {v30.d}[1], [TMP7] - st1 {v31.d}[1], [TMP8] - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 - blr x30 - -.balign 16 -2: - mul v3.8h, v3.8h, v19.8h - mul v4.8h, v4.8h, v20.8h - mul v5.8h, v5.8h, v21.8h - add TMP4, xzr, TMP2, LSL #32 - mul v6.8h, v6.8h, v22.8h - mul v7.8h, v7.8h, v23.8h - adds TMP3, xzr, TMP2, LSR #32 - mul v8.8h, v8.8h, v24.8h - mul v9.8h, v9.8h, v25.8h - b.ne 3f - /* Right AC coef is zero */ - dup v15.2d, v10.d[1] - /* Even part: reverse the even part of the forward DCT. */ - add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ - add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - mov v20.16b, v18.16b /* tmp3 = z1 */ - sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ - sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ - add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ - sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
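The shrn #16 / sqrshrn #(CONST_BITS+PASS1_BITS+3-16) pair in the pass above splits the full 18-bit pass-2 descale (CONST_BITS + PASS1_BITS + 3) in two because a narrowing-shift immediate cannot exceed 16; the truncating first shift loses nothing, since the rounding bit (1 << 17) is contributed by the second, rounding shift. A scalar sketch of the combined operation — my restatement, not code from this change:

#include <stdint.h>

#define CONST_BITS 13
#define PASS1_BITS 2
#define CENTERJSAMPLE 128

static uint8_t range_limit(int32_t v)
{
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* One output sample: round-shift by 18, re-center, clamp to 0..255.
 * Equivalent to shrn #16, then sqrshrn #2, then add #CENTERJSAMPLE. */
static uint8_t descale_output(int32_t x)
{
  return range_limit(((x + (1 << 17)) >> 18) + CENTERJSAMPLE);
}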
- */ - - add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ - - smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - add v22.4s, v22.4s, v26.4s /* z3 += z5 */ - add v24.4s, v24.4s, v26.4s /* z4 += z5 */ - - add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ - add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ - add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ - add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ - - add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ - add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ - add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ - add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ - sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ - add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ - sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ - add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ - sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ - add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ - sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ - - rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ - mov v6.16b, v15.16b - mov v7.16b, v15.16b - mov v8.16b, v15.16b - mov v9.16b, v15.16b - b 1b - -.balign 16 -3: - cbnz TMP4, 4f - /* Left AC coef is zero */ - dup v14.2d, v10.d[0] - /* Even part: reverse the even part of the forward DCT. 
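The labels 2:/3:/4: dispatch on whether the left or the right half of the coefficient block holds only a DC term. A half whose AC coefficients are all zero needs no arithmetic in pass 1: as in the scalar shortcut in jidctint.c, the dequantized, scaled DC value is simply broadcast to all eight outputs, which is what the dup instructions above achieve. A hedged scalar sketch of that shortcut (the PASS1_BITS scaling follows jidctint.c; the exact register staging in the assembly differs):

#include <stdint.h>

#define PASS1_BITS 2

/* DC-only column: the 1-D IDCT degenerates to one broadcast value. */
static void idct_col_dc_only(int16_t dc_coef, int16_t dc_quant,
                             int16_t *col /* stride 8 in the workspace */)
{
  int16_t dc = (int16_t)((dc_coef * dc_quant) << PASS1_BITS);
  for (int row = 0; row < 8; row++)
    col[row * 8] = dc;  /* cf. the dup-style broadcast above */
}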
*/ - add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ - add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - mov v21.16b, v19.16b /* tmp3 = z1 */ - smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ - sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ - add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ - sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. - */ - - add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ - - smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - add v23.4s, v23.4s, v27.4s /* z3 += z5 */ - add v22.4s, v22.4s, v26.4s /* z3 += z5 */ - add v25.4s, v25.4s, v27.4s /* z4 += z5 */ - add v24.4s, v24.4s, v26.4s /* z4 += z5 */ - - add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ - add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ - add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ - add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ - - add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ - add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ - add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ - add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ - sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ - add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ - sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ - add v27.4s, 
v30.4s, v13.4s /* tmp12 + tmp1 */ - sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ - add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ - sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ - - mov v2.16b, v14.16b - mov v3.16b, v14.16b - mov v4.16b, v14.16b - mov v5.16b, v14.16b - rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ - b 1b - -.balign 16 -4: - /* "No" AC coef is zero */ - /* Even part: reverse the even part of the forward DCT. */ - add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ - add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ - smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ - sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - mov v21.16b, v19.16b /* tmp3 = z1 */ - mov v20.16b, v18.16b /* tmp3 = z1 */ - smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ - sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ - sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ - sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ - add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ - sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ - add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ - sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ - add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ - sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ - add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ - sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
- */ - - add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ - add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ - add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ - - smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ - smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ - smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ - smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ - smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ - smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ - smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ - smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ - smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ - - add v23.4s, v23.4s, v27.4s /* z3 += z5 */ - add v22.4s, v22.4s, v26.4s /* z3 += z5 */ - add v25.4s, v25.4s, v27.4s /* z4 += z5 */ - add v24.4s, v24.4s, v26.4s /* z4 += z5 */ - - add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ - add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ - add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ - add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ - add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ - add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ - add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ - add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ - - add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ - add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ - add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ - add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ - add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ - add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ - add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ - add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ - add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ - sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ - sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ - add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ - add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ - sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ - sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ - add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ - 
add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ - sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ - sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ - add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ - add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ - sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ - sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ - - rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ - rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ - rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ - rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ - rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ - rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ - rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ - rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ - b 1b - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - .unreq TMP5 - .unreq TMP6 - .unreq TMP7 - .unreq TMP8 - -#undef CENTERJSAMPLE -#undef CONST_BITS -#undef PASS1_BITS -#undef XFIX_P_0_298 -#undef XFIX_N_0_390 -#undef XFIX_P_0_541 -#undef XFIX_P_0_765 -#undef XFIX_N_0_899 -#undef XFIX_P_1_175 -#undef XFIX_P_1_501 -#undef XFIX_N_1_847 -#undef XFIX_N_1_961 -#undef XFIX_P_2_053 -#undef XFIX_N_2_562 -#undef XFIX_P_3_072 - - -/*****************************************************************************/ - -/* - * jsimd_idct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the inverse DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' - * function from jidctfst.c - * - * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. - * But in ARM NEON case some extra additions are required because VQDMULH - * instruction can't handle the constants larger than 1. So the expressions - * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", - * which introduces an extra addition. 
Overall, there are 6 extra additions - * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. - */ - -asm_function jsimd_idct_ifast_neon - - DCT_TABLE .req x0 - COEF_BLOCK .req x1 - OUTPUT_BUF .req x2 - OUTPUT_COL .req x3 - TMP1 .req x0 - TMP2 .req x1 - TMP3 .req x9 - TMP4 .req x10 - TMP5 .req x11 - TMP6 .req x12 - TMP7 .req x13 - TMP8 .req x14 - - /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x3 are valid. This - instruction ensures that those bits are set to zero. */ - uxtw x3, w3 - - /* Load and dequantize coefficients into NEON registers - * with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 ( v16.8h ) - * 1 | d18 | d19 ( v17.8h ) - * 2 | d20 | d21 ( v18.8h ) - * 3 | d22 | d23 ( v19.8h ) - * 4 | d24 | d25 ( v20.8h ) - * 5 | d26 | d27 ( v21.8h ) - * 6 | d28 | d29 ( v22.8h ) - * 7 | d30 | d31 ( v23.8h ) - */ - /* Save NEON registers used in fast IDCT */ - get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts - ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32 - ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 - ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32 - mul v16.8h, v16.8h, v0.8h - ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 - mul v17.8h, v17.8h, v1.8h - ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32 - mul v18.8h, v18.8h, v2.8h - ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 - mul v19.8h, v19.8h, v3.8h - ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32 - mul v20.8h, v20.8h, v0.8h - ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 - mul v22.8h, v22.8h, v2.8h - mul v21.8h, v21.8h, v1.8h - ld1 {v0.4h}, [TMP5] /* load constants */ - mul v23.8h, v23.8h, v3.8h - - /* 1-D IDCT, pass 1 */ - sub v2.8h, v18.8h, v22.8h - add v22.8h, v18.8h, v22.8h - sub v1.8h, v19.8h, v21.8h - add v21.8h, v19.8h, v21.8h - sub v5.8h, v17.8h, v23.8h - add v23.8h, v17.8h, v23.8h - sqdmulh v4.8h, v2.8h, XFIX_1_414213562 - sqdmulh v6.8h, v1.8h, XFIX_2_613125930 - add v3.8h, v1.8h, v1.8h - sub v1.8h, v5.8h, v1.8h - add v18.8h, v2.8h, v4.8h - sqdmulh v4.8h, v1.8h, XFIX_1_847759065 - sub v2.8h, v23.8h, v21.8h - add v3.8h, v3.8h, v6.8h - sqdmulh v6.8h, v2.8h, XFIX_1_414213562 - add v1.8h, v1.8h, v4.8h - sqdmulh v4.8h, v5.8h, XFIX_1_082392200 - sub v18.8h, v18.8h, v22.8h - add v2.8h, v2.8h, v6.8h - sub v6.8h, v16.8h, v20.8h - add v20.8h, v16.8h, v20.8h - add v17.8h, v5.8h, v4.8h - add v5.8h, v6.8h, v18.8h - sub v18.8h, v6.8h, v18.8h - add v6.8h, v23.8h, v21.8h - add v16.8h, v20.8h, v22.8h - sub v3.8h, v6.8h, v3.8h - sub v20.8h, v20.8h, v22.8h - sub v3.8h, v3.8h, v1.8h - sub v1.8h, v17.8h, v1.8h - add v2.8h, v3.8h, v2.8h - sub v23.8h, v16.8h, v6.8h - add v1.8h, v1.8h, v2.8h - add v16.8h, v16.8h, v6.8h - add v22.8h, v5.8h, v3.8h - sub v17.8h, v5.8h, v3.8h - sub v21.8h, v18.8h, v2.8h - add v18.8h, v18.8h, v2.8h - sub v19.8h, v20.8h, v1.8h - add v20.8h, v20.8h, v1.8h - transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31 - /* 1-D IDCT, pass 2 */ - sub v2.8h, v18.8h, v22.8h - add v22.8h, v18.8h, v22.8h - sub v1.8h, v19.8h, v21.8h - add v21.8h, v19.8h, v21.8h - sub v5.8h, v17.8h, v23.8h - add v23.8h, v17.8h, v23.8h - sqdmulh v4.8h, v2.8h, XFIX_1_414213562 - sqdmulh v6.8h, v1.8h, XFIX_2_613125930 - add v3.8h, v1.8h, v1.8h - sub v1.8h, v5.8h, v1.8h - add v18.8h, v2.8h, v4.8h - sqdmulh v4.8h, v1.8h, XFIX_1_847759065 - sub v2.8h, v23.8h, v21.8h - add v3.8h, v3.8h, v6.8h - sqdmulh v6.8h, v2.8h, XFIX_1_414213562 - add v1.8h, v1.8h, v4.8h - sqdmulh v4.8h, v5.8h, XFIX_1_082392200 - sub v18.8h, v18.8h, v22.8h - add v2.8h, v2.8h, v6.8h - sub v6.8h, v16.8h, v20.8h 
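As the comment above explains, SQDMULH computes (a * b * 2) >> 16 on Q15 operands, so any multiplier of 1.0 or more must be split into an integer part (handled with plain additions) and a fractional Q15 part. A sketch with NEON intrinsics — the constant is my own Q15 rounding of 0.414213562, not a value quoted from the deleted constant table:

#include <arm_neon.h>

/* x * 1.414213562 rewritten as x + x * 0.414213562, the shape used
 * for the sqdmulh + add pairs above. */
static int16x8_t mul_1_414213562(int16x8_t x)
{
  const int16x8_t frac = vdupq_n_s16(13573);   /* ~0.414213562 in Q15 */
  return vaddq_s16(x, vqdmulhq_s16(x, frac));  /* sqdmulh == (x*c*2)>>16 */
}

The same split explains the 2.613125930 case, where two additions of x supply the integer part 2 before the fractional product is added, accounting for the extra VADDs tallied in the comment.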
- add v20.8h, v16.8h, v20.8h - add v17.8h, v5.8h, v4.8h - add v5.8h, v6.8h, v18.8h - sub v18.8h, v6.8h, v18.8h - add v6.8h, v23.8h, v21.8h - add v16.8h, v20.8h, v22.8h - sub v3.8h, v6.8h, v3.8h - sub v20.8h, v20.8h, v22.8h - sub v3.8h, v3.8h, v1.8h - sub v1.8h, v17.8h, v1.8h - add v2.8h, v3.8h, v2.8h - sub v23.8h, v16.8h, v6.8h - add v1.8h, v1.8h, v2.8h - add v16.8h, v16.8h, v6.8h - add v22.8h, v5.8h, v3.8h - sub v17.8h, v5.8h, v3.8h - sub v21.8h, v18.8h, v2.8h - add v18.8h, v18.8h, v2.8h - sub v19.8h, v20.8h, v1.8h - add v20.8h, v20.8h, v1.8h - /* Descale to 8-bit and range limit */ - movi v0.16b, #0x80 - /* Prepare pointers (dual-issue with NEON instructions) */ - ldp TMP1, TMP2, [OUTPUT_BUF], 16 - sqshrn v28.8b, v16.8h, #5 - ldp TMP3, TMP4, [OUTPUT_BUF], 16 - sqshrn v29.8b, v17.8h, #5 - add TMP1, TMP1, OUTPUT_COL - sqshrn v30.8b, v18.8h, #5 - add TMP2, TMP2, OUTPUT_COL - sqshrn v31.8b, v19.8h, #5 - add TMP3, TMP3, OUTPUT_COL - sqshrn2 v28.16b, v20.8h, #5 - add TMP4, TMP4, OUTPUT_COL - sqshrn2 v29.16b, v21.8h, #5 - ldp TMP5, TMP6, [OUTPUT_BUF], 16 - sqshrn2 v30.16b, v22.8h, #5 - ldp TMP7, TMP8, [OUTPUT_BUF], 16 - sqshrn2 v31.16b, v23.8h, #5 - add TMP5, TMP5, OUTPUT_COL - add v16.16b, v28.16b, v0.16b - add TMP6, TMP6, OUTPUT_COL - add v18.16b, v29.16b, v0.16b - add TMP7, TMP7, OUTPUT_COL - add v20.16b, v30.16b, v0.16b - add TMP8, TMP8, OUTPUT_COL - add v22.16b, v31.16b, v0.16b - - /* Transpose the final 8-bit samples */ - trn1 v28.16b, v16.16b, v18.16b - trn1 v30.16b, v20.16b, v22.16b - trn2 v29.16b, v16.16b, v18.16b - trn2 v31.16b, v20.16b, v22.16b - - trn1 v16.8h, v28.8h, v30.8h - trn2 v18.8h, v28.8h, v30.8h - trn1 v20.8h, v29.8h, v31.8h - trn2 v22.8h, v29.8h, v31.8h - - uzp1 v28.4s, v16.4s, v18.4s - uzp2 v30.4s, v16.4s, v18.4s - uzp1 v29.4s, v20.4s, v22.4s - uzp2 v31.4s, v20.4s, v22.4s - - /* Store results to the output buffer */ - st1 {v28.d}[0], [TMP1] - st1 {v29.d}[0], [TMP2] - st1 {v28.d}[1], [TMP3] - st1 {v29.d}[1], [TMP4] - st1 {v30.d}[0], [TMP5] - st1 {v31.d}[0], [TMP6] - st1 {v30.d}[1], [TMP7] - st1 {v31.d}[1], [TMP8] - blr x30 - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - .unreq TMP5 - .unreq TMP6 - .unreq TMP7 - .unreq TMP8 - - -/*****************************************************************************/ - -/* - * jsimd_idct_4x4_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - * - * TODO: a bit better instructions scheduling can be achieved by expanding - * idct_helper/transpose_4x4 macros and reordering instructions, - * but readability will suffer somewhat. 
- */ - -#define CONST_BITS 13 - -.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 - smull v28.4s, \x4, v2.h[2] - smlal v28.4s, \x8, v0.h[0] - smlal v28.4s, \x14, v0.h[1] - - smull v26.4s, \x16, v1.h[2] - smlal v26.4s, \x12, v1.h[3] - smlal v26.4s, \x10, v2.h[0] - smlal v26.4s, \x6, v2.h[1] - - smull v30.4s, \x4, v2.h[2] - smlsl v30.4s, \x8, v0.h[0] - smlsl v30.4s, \x14, v0.h[1] - - smull v24.4s, \x16, v0.h[2] - smlal v24.4s, \x12, v0.h[3] - smlal v24.4s, \x10, v1.h[0] - smlal v24.4s, \x6, v1.h[1] - - add v20.4s, v28.4s, v26.4s - sub v28.4s, v28.4s, v26.4s - - .if \shift > 16 - srshr v20.4s, v20.4s, #\shift - srshr v28.4s, v28.4s, #\shift - xtn \y26, v20.4s - xtn \y29, v28.4s - .else - rshrn \y26, v20.4s, #\shift - rshrn \y29, v28.4s, #\shift - .endif - - add v20.4s, v30.4s, v24.4s - sub v30.4s, v30.4s, v24.4s - - .if \shift > 16 - srshr v20.4s, v20.4s, #\shift - srshr v30.4s, v30.4s, #\shift - xtn \y27, v20.4s - xtn \y28, v30.4s - .else - rshrn \y27, v20.4s, #\shift - rshrn \y28, v30.4s, #\shift - .endif -.endm - -asm_function jsimd_idct_4x4_neon - - DCT_TABLE .req x0 - COEF_BLOCK .req x1 - OUTPUT_BUF .req x2 - OUTPUT_COL .req x3 - TMP1 .req x0 - TMP2 .req x1 - TMP3 .req x2 - TMP4 .req x15 - - /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x3 are valid. This - instruction ensures that those bits are set to zero. */ - uxtw x3, w3 - - /* Save all used NEON registers */ - sub sp, sp, 64 - mov x9, sp - /* Load constants (v3.4h is just used for padding) */ - get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 - ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | v4.4h | v5.4h - * 1 | v6.4h | v7.4h - * 2 | v8.4h | v9.4h - * 3 | v10.4h | v11.4h - * 4 | - | - - * 5 | v12.4h | v13.4h - * 6 | v14.4h | v15.4h - * 7 | v16.4h | v17.4h - */ - ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 - ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 - ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 - /* dequantize */ - ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 - mul v4.4h, v4.4h, v18.4h - mul v5.4h, v5.4h, v19.4h - ins v4.d[1], v5.d[0] /* 128 bit q4 */ - ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 - mul v6.4h, v6.4h, v20.4h - mul v7.4h, v7.4h, v21.4h - ins v6.d[1], v7.d[0] /* 128 bit q6 */ - mul v8.4h, v8.4h, v22.4h - mul v9.4h, v9.4h, v23.4h - ins v8.d[1], v9.d[0] /* 128 bit q8 */ - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 - mul v10.4h, v10.4h, v24.4h - mul v11.4h, v11.4h, v25.4h - ins v10.d[1], v11.d[0] /* 128 bit q10 */ - mul v12.4h, v12.4h, v26.4h - mul v13.4h, v13.4h, v27.4h - ins v12.d[1], v13.d[0] /* 128 bit q12 */ - ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 - mul v14.4h, v14.4h, v28.4h - mul v15.4h, v15.4h, v29.4h - ins v14.d[1], v15.d[0] /* 128 bit q14 */ - mul v16.4h, v16.4h, v30.4h - mul v17.4h, v17.4h, v31.4h - ins v16.d[1], v17.d[0] /* 128 bit q16 */ - - /* Pass 1 */ - idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \ - v4.4h, v6.4h, v8.4h, v10.4h - transpose_4x4 v4, v6, v8, v10, v3 - ins v10.d[1], v11.d[0] - idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \ - v5.4h, v7.4h, v9.4h, v11.4h - 
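The .if \shift > 16 branch inside idct_helper above exists for the immediate-range reason already seen in the full-size IDCT: pass 1 passes shift 12, which a single narrowing round-shift can perform, while pass 2 passes shift 19, which must be rounded at 32 bits and only then narrowed. A sketch of the two paths with intrinsics (shift values taken from the idct_helper invocations; the function names are mine):

#include <arm_neon.h>

/* Pass 1, shift 12: within the 1..16 range of a narrowing round-shift. */
static inline int16x4_t descale_pass1(int32x4_t v)
{
  return vrshrn_n_s32(v, 12);            /* single rshrn */
}

/* Pass 2, shift 19: round in 32 bits, then narrow (srshr + xtn). */
static inline int16x4_t descale_pass2(int32x4_t v)
{
  return vmovn_s32(vrshrq_n_s32(v, 19));
}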
transpose_4x4 v5, v7, v9, v11, v3 - ins v10.d[1], v11.d[0] - - /* Pass 2 */ - idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \ - v26.4h, v27.4h, v28.4h, v29.4h - transpose_4x4 v26, v27, v28, v29, v3 - - /* Range limit */ - movi v30.8h, #0x80 - ins v26.d[1], v27.d[0] - ins v28.d[1], v29.d[0] - add v26.8h, v26.8h, v30.8h - add v28.8h, v28.8h, v30.8h - sqxtun v26.8b, v26.8h - sqxtun v27.8b, v28.8h - - /* Store results to the output buffer */ - ldp TMP1, TMP2, [OUTPUT_BUF], 16 - ldp TMP3, TMP4, [OUTPUT_BUF] - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - add TMP3, TMP3, OUTPUT_COL - add TMP4, TMP4, OUTPUT_COL - -#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT - /* We can use much less instructions on little endian systems if the - * OS kernel is not configured to trap unaligned memory accesses - */ - st1 {v26.s}[0], [TMP1], 4 - st1 {v27.s}[0], [TMP3], 4 - st1 {v26.s}[1], [TMP2], 4 - st1 {v27.s}[1], [TMP4], 4 -#else - st1 {v26.b}[0], [TMP1], 1 - st1 {v27.b}[0], [TMP3], 1 - st1 {v26.b}[1], [TMP1], 1 - st1 {v27.b}[1], [TMP3], 1 - st1 {v26.b}[2], [TMP1], 1 - st1 {v27.b}[2], [TMP3], 1 - st1 {v26.b}[3], [TMP1], 1 - st1 {v27.b}[3], [TMP3], 1 - - st1 {v26.b}[4], [TMP2], 1 - st1 {v27.b}[4], [TMP4], 1 - st1 {v26.b}[5], [TMP2], 1 - st1 {v27.b}[5], [TMP4], 1 - st1 {v26.b}[6], [TMP2], 1 - st1 {v27.b}[6], [TMP4], 1 - st1 {v26.b}[7], [TMP2], 1 - st1 {v27.b}[7], [TMP4], 1 -#endif - - /* vpop {v8.4h - v15.4h} ;not available */ - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - blr x30 - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_idct_2x2_neon - * - * This function contains inverse-DCT code for getting reduced-size - * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' - * function from jpeg-6b (jidctred.c). - * - * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which - * requires much less arithmetic operations and hence should be faster. - * The primary purpose of this particular NEON optimized function is - * bit exact compatibility with jpeg-6b. - */ - -.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 - sshll v15.4s, \x4, #15 - smull v26.4s, \x6, v14.h[3] - smlal v26.4s, \x10, v14.h[2] - smlal v26.4s, \x12, v14.h[1] - smlal v26.4s, \x16, v14.h[0] - - add v20.4s, v15.4s, v26.4s - sub v15.4s, v15.4s, v26.4s - - .if \shift > 16 - srshr v20.4s, v20.4s, #\shift - srshr v15.4s, v15.4s, #\shift - xtn \y26, v20.4s - xtn \y27, v15.4s - .else - rshrn \y26, v20.4s, #\shift - rshrn \y27, v15.4s, #\shift - .endif -.endm - -asm_function jsimd_idct_2x2_neon - - DCT_TABLE .req x0 - COEF_BLOCK .req x1 - OUTPUT_BUF .req x2 - OUTPUT_COL .req x3 - TMP1 .req x0 - TMP2 .req x15 - - /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x3 are valid. This - instruction ensures that those bits are set to zero. 
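The __ARMEL__ && !RESPECT_STRICT_ALIGNMENT branch above trades sixteen byte stores for four word stores: output rows are only guaranteed byte-aligned, so the wide store is legal only where the kernel and CPU tolerate unaligned accesses. A scalar sketch of the same choice (little-endian assumed, matching __ARMEL__):

#include <stdint.h>
#include <string.h>

/* Store four output pixels to a destination that is only byte-aligned. */
static void store4(uint8_t *dst, uint32_t px, int unaligned_ok)
{
  if (unaligned_ok) {
    memcpy(dst, &px, 4);             /* one word store, cf. st1 {v.s}[n] */
  } else {
    for (int i = 0; i < 4; i++)      /* cf. the st1 {v.b}[n] fallback */
      dst[i] = (uint8_t)(px >> (8 * i));
  }
}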
*/ - uxtw x3, w3 - - /* vpush {v8.4h - v15.4h} ; not available */ - sub sp, sp, 64 - mov x9, sp - - /* Load constants */ - get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 - ld1 {v14.4h}, [TMP2] - - /* Load all COEF_BLOCK into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | v4.4h | v5.4h - * 1 | v6.4h | v7.4h - * 2 | - | - - * 3 | v10.4h | v11.4h - * 4 | - | - - * 5 | v12.4h | v13.4h - * 6 | - | - - * 7 | v16.4h | v17.4h - */ - ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 - add COEF_BLOCK, COEF_BLOCK, #16 - ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 - /* Dequantize */ - ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 - mul v4.4h, v4.4h, v18.4h - mul v5.4h, v5.4h, v19.4h - ins v4.d[1], v5.d[0] - mul v6.4h, v6.4h, v20.4h - mul v7.4h, v7.4h, v21.4h - ins v6.d[1], v7.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 - mul v10.4h, v10.4h, v24.4h - mul v11.4h, v11.4h, v25.4h - ins v10.d[1], v11.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 - mul v12.4h, v12.4h, v26.4h - mul v13.4h, v13.4h, v27.4h - ins v12.d[1], v13.d[0] - add DCT_TABLE, DCT_TABLE, #16 - ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 - mul v16.4h, v16.4h, v30.4h - mul v17.4h, v17.4h, v31.4h - ins v16.d[1], v17.d[0] - - /* Pass 1 */ -#if 0 - idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h - transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h - idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h - transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h -#else - smull v26.4s, v6.4h, v14.h[3] - smlal v26.4s, v10.4h, v14.h[2] - smlal v26.4s, v12.4h, v14.h[1] - smlal v26.4s, v16.4h, v14.h[0] - smull v24.4s, v7.4h, v14.h[3] - smlal v24.4s, v11.4h, v14.h[2] - smlal v24.4s, v13.4h, v14.h[1] - smlal v24.4s, v17.4h, v14.h[0] - sshll v15.4s, v4.4h, #15 - sshll v30.4s, v5.4h, #15 - add v20.4s, v15.4s, v26.4s - sub v15.4s, v15.4s, v26.4s - rshrn v4.4h, v20.4s, #13 - rshrn v6.4h, v15.4s, #13 - add v20.4s, v30.4s, v24.4s - sub v15.4s, v30.4s, v24.4s - rshrn v5.4h, v20.4s, #13 - rshrn v7.4h, v15.4s, #13 - ins v4.d[1], v5.d[0] - ins v6.d[1], v7.d[0] - transpose v4, v6, v3, .16b, .8h - transpose v6, v10, v3, .16b, .4s - ins v11.d[0], v10.d[1] - ins v7.d[0], v6.d[1] -#endif - - /* Pass 2 */ - idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h - - /* Range limit */ - movi v30.8h, #0x80 - ins v26.d[1], v27.d[0] - add v26.8h, v26.8h, v30.8h - sqxtun v30.8b, v26.8h - ins v26.d[0], v30.d[0] - sqxtun v27.8b, v26.8h - - /* Store results to the output buffer */ - ldp TMP1, TMP2, [OUTPUT_BUF] - add TMP1, TMP1, OUTPUT_COL - add TMP2, TMP2, OUTPUT_COL - - st1 {v26.b}[0], [TMP1], 1 - st1 {v27.b}[4], [TMP1], 1 - st1 {v26.b}[1], [TMP2], 1 - st1 {v27.b}[5], [TMP2], 1 - - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - blr x30 - - .unreq DCT_TABLE - .unreq COEF_BLOCK - .unreq OUTPUT_BUF - .unreq OUTPUT_COL - .unreq TMP1 - .unreq TMP2 - -.purgem idct_helper - - -/*****************************************************************************/ - -/* - * jsimd_ycc_extrgb_convert_neon - * jsimd_ycc_extbgr_convert_neon - * jsimd_ycc_extrgbx_convert_neon - * jsimd_ycc_extbgrx_convert_neon - * jsimd_ycc_extxbgr_convert_neon - * 
jsimd_ycc_extxrgb_convert_neon - * - * Colorspace conversion YCbCr -> RGB - */ - -.macro do_load size - .if \size == 8 - ld1 {v4.8b}, [U], 8 - ld1 {v5.8b}, [V], 8 - ld1 {v0.8b}, [Y], 8 - prfm pldl1keep, [U, #64] - prfm pldl1keep, [V, #64] - prfm pldl1keep, [Y, #64] - .elseif \size == 4 - ld1 {v4.b}[0], [U], 1 - ld1 {v4.b}[1], [U], 1 - ld1 {v4.b}[2], [U], 1 - ld1 {v4.b}[3], [U], 1 - ld1 {v5.b}[0], [V], 1 - ld1 {v5.b}[1], [V], 1 - ld1 {v5.b}[2], [V], 1 - ld1 {v5.b}[3], [V], 1 - ld1 {v0.b}[0], [Y], 1 - ld1 {v0.b}[1], [Y], 1 - ld1 {v0.b}[2], [Y], 1 - ld1 {v0.b}[3], [Y], 1 - .elseif \size == 2 - ld1 {v4.b}[4], [U], 1 - ld1 {v4.b}[5], [U], 1 - ld1 {v5.b}[4], [V], 1 - ld1 {v5.b}[5], [V], 1 - ld1 {v0.b}[4], [Y], 1 - ld1 {v0.b}[5], [Y], 1 - .elseif \size == 1 - ld1 {v4.b}[6], [U], 1 - ld1 {v5.b}[6], [V], 1 - ld1 {v0.b}[6], [Y], 1 - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_store bpp, size, fast_st3 - .if \bpp == 24 - .if \size == 8 - .if \fast_st3 == 1 - st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 - .else - st1 {v10.b}[0], [RGB], #1 - st1 {v11.b}[0], [RGB], #1 - st1 {v12.b}[0], [RGB], #1 - - st1 {v10.b}[1], [RGB], #1 - st1 {v11.b}[1], [RGB], #1 - st1 {v12.b}[1], [RGB], #1 - - st1 {v10.b}[2], [RGB], #1 - st1 {v11.b}[2], [RGB], #1 - st1 {v12.b}[2], [RGB], #1 - - st1 {v10.b}[3], [RGB], #1 - st1 {v11.b}[3], [RGB], #1 - st1 {v12.b}[3], [RGB], #1 - - st1 {v10.b}[4], [RGB], #1 - st1 {v11.b}[4], [RGB], #1 - st1 {v12.b}[4], [RGB], #1 - - st1 {v10.b}[5], [RGB], #1 - st1 {v11.b}[5], [RGB], #1 - st1 {v12.b}[5], [RGB], #1 - - st1 {v10.b}[6], [RGB], #1 - st1 {v11.b}[6], [RGB], #1 - st1 {v12.b}[6], [RGB], #1 - - st1 {v10.b}[7], [RGB], #1 - st1 {v11.b}[7], [RGB], #1 - st1 {v12.b}[7], [RGB], #1 - .endif - .elseif \size == 4 - st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 - st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 - st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 - st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 - .elseif \size == 2 - st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 - st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 - .elseif \size == 1 - st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 - .elseif \size == 4 - st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 - st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 - st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 - st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 - .elseif \size == 2 - st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 - st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 - .elseif \size == 1 - st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 16 - .if \size == 8 - st1 {v25.8h}, [RGB], 16 - .elseif \size == 4 - st1 {v25.4h}, [RGB], 8 - .elseif \size == 2 - st1 {v25.h}[4], [RGB], 2 - st1 {v25.h}[5], [RGB], 2 - .elseif \size == 1 - st1 {v25.h}[6], [RGB], 2 - .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ - g_offs, gsize, b_offs, bsize, \ - defsize, fast_st3 - -/* - * 2-stage pipelined YCbCr->RGB conversion - */ - -.macro do_yuv_to_rgb_stage1 - uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ - uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ - smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ - smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ - smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ - smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ - 
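The "multiply by ..." annotations above are the JFIF YCbCr->RGB coefficients in fixed point: 1.40200 and 1.77200 exceed the Q15 range and are held in Q14 (hence the #14 descales in stage 2 below), while the two green-channel terms fit Q15 (descale #15). A per-pixel scalar sketch using the same constants — the helper name is mine:

#include <stdint.h>

static uint8_t clamp255(int v)
{
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* What the two pipeline stages compute for one pixel. */
static void ycc_to_rgb_pixel(uint8_t y, uint8_t cb, uint8_t cr,
                             uint8_t rgb[3])
{
  int u = cb - 128, v = cr - 128;
  rgb[0] = clamp255(y + ((22971 * v + (1 << 13)) >> 14));              /* R, Q14 */
  rgb[1] = clamp255(y + ((-11277 * u - 23401 * v + (1 << 14)) >> 15)); /* G, Q15 */
  rgb[2] = clamp255(y + ((29033 * u + (1 << 13)) >> 14));              /* B, Q14 */
}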
smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ - smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ - smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ - smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb_stage2 - rshrn v20.4h, v20.4s, #15 - rshrn2 v20.8h, v22.4s, #15 - rshrn v24.4h, v24.4s, #14 - rshrn2 v24.8h, v26.4s, #14 - rshrn v28.4h, v28.4s, #14 - rshrn2 v28.8h, v30.4s, #14 - uaddw v20.8h, v20.8h, v0.8b - uaddw v24.8h, v24.8h, v0.8b - uaddw v28.8h, v28.8h, v0.8b - .if \bpp != 16 - sqxtun v1\g_offs\defsize, v20.8h - sqxtun v1\r_offs\defsize, v24.8h - sqxtun v1\b_offs\defsize, v28.8h - .else - sqshlu v21.8h, v20.8h, #8 - sqshlu v25.8h, v24.8h, #8 - sqshlu v29.8h, v28.8h, #8 - sri v25.8h, v21.8h, #5 - sri v25.8h, v29.8h, #11 - .endif -.endm - -.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 - rshrn v20.4h, v20.4s, #15 - rshrn v24.4h, v24.4s, #14 - rshrn v28.4h, v28.4s, #14 - ld1 {v4.8b}, [U], 8 - rshrn2 v20.8h, v22.4s, #15 - rshrn2 v24.8h, v26.4s, #14 - rshrn2 v28.8h, v30.4s, #14 - ld1 {v5.8b}, [V], 8 - uaddw v20.8h, v20.8h, v0.8b - uaddw v24.8h, v24.8h, v0.8b - uaddw v28.8h, v28.8h, v0.8b - .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ - sqxtun v1\g_offs\defsize, v20.8h - ld1 {v0.8b}, [Y], 8 - sqxtun v1\r_offs\defsize, v24.8h - prfm pldl1keep, [U, #64] - prfm pldl1keep, [V, #64] - prfm pldl1keep, [Y, #64] - sqxtun v1\b_offs\defsize, v28.8h - uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ - uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ - smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ - smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ - smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ - smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ - smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ - smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ - .else /**************************** rgb565 ********************************/ - sqshlu v21.8h, v20.8h, #8 - sqshlu v25.8h, v24.8h, #8 - sqshlu v29.8h, v28.8h, #8 - uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ - uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ - ld1 {v0.8b}, [Y], 8 - smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ - smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ - smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ - smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ - sri v25.8h, v21.8h, #5 - smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ - smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ - prfm pldl1keep, [U, #64] - prfm pldl1keep, [V, #64] - prfm pldl1keep, [Y, #64] - sri v25.8h, v29.8h, #11 - .endif - do_store \bpp, 8, \fast_st3 - smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ - smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ -.endm - -.macro do_yuv_to_rgb - do_yuv_to_rgb_stage1 - do_yuv_to_rgb_stage2 -.endm - -/* Apple gas crashes on adrl, work around that by using adr. - * But this requires a copy of these constants for each function. 
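In the RGB565 path above, sqshlu ..., #8 places each 8-bit channel in the upper byte of a 16-bit lane, and the two sri (shift-right-insert) steps fold green and blue in beneath red's top five bits. The scalar equivalent of the packed result:

#include <stdint.h>

/* 5-6-5 packing, matching the sqshlu #8 + sri #5 + sri #11 sequence. */
static uint16_t pack565(uint8_t r, uint8_t g, uint8_t b)
{
  return (uint16_t)(((r & 0xF8) << 8) |  /* red:   bits 15..11 */
                    ((g & 0xFC) << 3) |  /* green: bits 10..5  */
                    (b >> 3));           /* blue:  bits  4..0  */
}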
- */ - -.if \fast_st3 == 1 -asm_function jsimd_ycc_\colorid\()_convert_neon -.else -asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 -.endif - OUTPUT_WIDTH .req w0 - INPUT_BUF .req x1 - INPUT_ROW .req w2 - OUTPUT_BUF .req x3 - NUM_ROWS .req w4 - - INPUT_BUF0 .req x5 - INPUT_BUF1 .req x6 - INPUT_BUF2 .req x1 - - RGB .req x7 - Y .req x9 - U .req x10 - V .req x11 - N .req w15 - - sub sp, sp, 64 - mov x9, sp - - /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ - get_symbol_loc x15, Ljsimd_ycc_colorid_neon_consts - - /* Save NEON registers */ - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 - ld1 {v0.4h, v1.4h}, [x15], 16 - ld1 {v2.8h}, [x15] - - ldr INPUT_BUF0, [INPUT_BUF] - ldr INPUT_BUF1, [INPUT_BUF, #8] - ldr INPUT_BUF2, [INPUT_BUF, #16] - .unreq INPUT_BUF - - /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ - movi v10.16b, #255 - movi v13.16b, #255 - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - b.lt 9f -0: - ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] - ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] - mov N, OUTPUT_WIDTH - ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] - add INPUT_ROW, INPUT_ROW, #1 - ldr RGB, [OUTPUT_BUF], #8 - - /* Inner loop over pixels */ - subs N, N, #8 - b.lt 3f - do_load 8 - do_yuv_to_rgb_stage1 - subs N, N, #8 - b.lt 2f -1: - do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 - subs N, N, #8 - b.ge 1b -2: - do_yuv_to_rgb_stage2 - do_store \bpp, 8, \fast_st3 - tst N, #7 - b.eq 8f -3: - tst N, #4 - b.eq 3f - do_load 4 -3: - tst N, #2 - b.eq 4f - do_load 2 -4: - tst N, #1 - b.eq 5f - do_load 1 -5: - do_yuv_to_rgb - tst N, #4 - b.eq 6f - do_store \bpp, 4, \fast_st3 -6: - tst N, #2 - b.eq 7f - do_store \bpp, 2, \fast_st3 -7: - tst N, #1 - b.eq 8f - do_store \bpp, 1, \fast_st3 -8: - subs NUM_ROWS, NUM_ROWS, #1 - b.gt 0b -9: - /* Restore all registers and return */ - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - br x30 - .unreq OUTPUT_WIDTH - .unreq INPUT_ROW - .unreq OUTPUT_BUF - .unreq NUM_ROWS - .unreq INPUT_BUF0 - .unreq INPUT_BUF1 - .unreq INPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_yuv_to_rgb -.purgem do_yuv_to_rgb_stage1 -.purgem do_yuv_to_rgb_stage2 -.purgem do_yuv_to_rgb_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 -generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 - -generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 -generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * jsimd_extrgb_ycc_convert_neon - * jsimd_extbgr_ycc_convert_neon - * jsimd_extrgbx_ycc_convert_neon - * jsimd_extbgrx_ycc_convert_neon - * jsimd_extxbgr_ycc_convert_neon - * jsimd_extxrgb_ycc_convert_neon - * - * Colorspace conversion RGB -> YCbCr - */ - -.macro do_store size - 
.if \size == 8 - st1 {v20.8b}, [Y], #8 - st1 {v21.8b}, [U], #8 - st1 {v22.8b}, [V], #8 - .elseif \size == 4 - st1 {v20.b}[0], [Y], #1 - st1 {v20.b}[1], [Y], #1 - st1 {v20.b}[2], [Y], #1 - st1 {v20.b}[3], [Y], #1 - st1 {v21.b}[0], [U], #1 - st1 {v21.b}[1], [U], #1 - st1 {v21.b}[2], [U], #1 - st1 {v21.b}[3], [U], #1 - st1 {v22.b}[0], [V], #1 - st1 {v22.b}[1], [V], #1 - st1 {v22.b}[2], [V], #1 - st1 {v22.b}[3], [V], #1 - .elseif \size == 2 - st1 {v20.b}[4], [Y], #1 - st1 {v20.b}[5], [Y], #1 - st1 {v21.b}[4], [U], #1 - st1 {v21.b}[5], [U], #1 - st1 {v22.b}[4], [V], #1 - st1 {v22.b}[5], [V], #1 - .elseif \size == 1 - st1 {v20.b}[6], [Y], #1 - st1 {v21.b}[6], [U], #1 - st1 {v22.b}[6], [V], #1 - .else - .error unsupported macroblock size - .endif -.endm - -.macro do_load bpp, size, fast_ld3 - .if \bpp == 24 - .if \size == 8 - .if \fast_ld3 == 1 - ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 - .else - ld1 {v10.b}[0], [RGB], #1 - ld1 {v11.b}[0], [RGB], #1 - ld1 {v12.b}[0], [RGB], #1 - - ld1 {v10.b}[1], [RGB], #1 - ld1 {v11.b}[1], [RGB], #1 - ld1 {v12.b}[1], [RGB], #1 - - ld1 {v10.b}[2], [RGB], #1 - ld1 {v11.b}[2], [RGB], #1 - ld1 {v12.b}[2], [RGB], #1 - - ld1 {v10.b}[3], [RGB], #1 - ld1 {v11.b}[3], [RGB], #1 - ld1 {v12.b}[3], [RGB], #1 - - ld1 {v10.b}[4], [RGB], #1 - ld1 {v11.b}[4], [RGB], #1 - ld1 {v12.b}[4], [RGB], #1 - - ld1 {v10.b}[5], [RGB], #1 - ld1 {v11.b}[5], [RGB], #1 - ld1 {v12.b}[5], [RGB], #1 - - ld1 {v10.b}[6], [RGB], #1 - ld1 {v11.b}[6], [RGB], #1 - ld1 {v12.b}[6], [RGB], #1 - - ld1 {v10.b}[7], [RGB], #1 - ld1 {v11.b}[7], [RGB], #1 - ld1 {v12.b}[7], [RGB], #1 - .endif - prfm pldl1keep, [RGB, #128] - .elseif \size == 4 - ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 - ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 - ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 - ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 - .elseif \size == 2 - ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 - ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 - .elseif \size == 1 - ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 - .else - .error unsupported macroblock size - .endif - .elseif \bpp == 32 - .if \size == 8 - ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 - prfm pldl1keep, [RGB, #128] - .elseif \size == 4 - ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 - ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 - ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 - ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 - .elseif \size == 2 - ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 - ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 - .elseif \size == 1 - ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 - .else - .error unsupported macroblock size - .endif - .else - .error unsupported bpp - .endif -.endm - -.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ - b_offs, fast_ld3 - -/* - * 2-stage pipelined RGB->YCbCr conversion - */ - -.macro do_rgb_to_yuv_stage1 - ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ - ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ - ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ - rev64 v18.4s, v1.4s - rev64 v26.4s, v1.4s - rev64 v28.4s, v1.4s - rev64 v30.4s, v1.4s - umull v14.4s, v4.4h, v0.h[0] - umull2 v16.4s, v4.8h, v0.h[0] - umlsl v18.4s, v4.4h, v0.h[3] - umlsl2 v26.4s, v4.8h, v0.h[3] - umlal v28.4s, v4.4h, v0.h[5] - umlal2 v30.4s, v4.8h, v0.h[5] - umlal v14.4s, v6.4h, v0.h[1] - umlal2 v16.4s, v6.8h, v0.h[1] - umlsl v18.4s, v6.4h, v0.h[4] - umlsl2 v26.4s, v6.8h, v0.h[4] - umlsl v28.4s, v6.4h, v0.h[6] - umlsl2 v30.4s, v6.8h, v0.h[6] - umlal v14.4s, v8.4h, v0.h[2] - umlal2 v16.4s, v8.8h, v0.h[2] - umlal v18.4s, v8.4h, v0.h[5] - 
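The fast_ld3 parameter of this macro generates each converter twice: jsimd_*_ycc_convert_neon deinterleaves eight RGB pixels with a single LD3, while the _slowld3 twin replays the same load as 24 single-lane LD1s for cores where LD3 itself is the bottleneck. Either path computes the following, shown here in scalar form:

#include <stdint.h>

/* One 8-pixel deinterleave step for 24-bpp input, the scalar
 * equivalent of ld3 {v10.8b, v11.8b, v12.8b}, [RGB]. */
static void load8_rgb(const uint8_t *rgb,
                      uint8_t r[8], uint8_t g[8], uint8_t b[8])
{
  for (int i = 0; i < 8; i++) {
    r[i] = rgb[3 * i + 0];
    g[i] = rgb[3 * i + 1];
    b[i] = rgb[3 * i + 2];
  }
}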
umlal2 v26.4s, v8.8h, v0.h[5] - umlsl v28.4s, v8.4h, v0.h[7] - umlsl2 v30.4s, v8.8h, v0.h[7] -.endm - -.macro do_rgb_to_yuv_stage2 - rshrn v20.4h, v14.4s, #16 - shrn v22.4h, v18.4s, #16 - shrn v24.4h, v28.4s, #16 - rshrn2 v20.8h, v16.4s, #16 - shrn2 v22.8h, v26.4s, #16 - shrn2 v24.8h, v30.4s, #16 - xtn v20.8b, v20.8h /* v20 = y */ - xtn v21.8b, v22.8h /* v21 = u */ - xtn v22.8b, v24.8h /* v22 = v */ -.endm - -.macro do_rgb_to_yuv - do_rgb_to_yuv_stage1 - do_rgb_to_yuv_stage2 -.endm - -/* TODO: expand macros and interleave instructions if some in-order - * ARM64 processor actually can dual-issue LOAD/STORE with ALU */ -.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 - do_rgb_to_yuv_stage2 - do_load \bpp, 8, \fast_ld3 - st1 {v20.8b}, [Y], #8 - st1 {v21.8b}, [U], #8 - st1 {v22.8b}, [V], #8 - do_rgb_to_yuv_stage1 -.endm - - -.if \fast_ld3 == 1 -asm_function jsimd_\colorid\()_ycc_convert_neon -.else -asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 -.endif - OUTPUT_WIDTH .req w0 - INPUT_BUF .req x1 - OUTPUT_BUF .req x2 - OUTPUT_ROW .req w3 - NUM_ROWS .req w4 - - OUTPUT_BUF0 .req x5 - OUTPUT_BUF1 .req x6 - OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ - - RGB .req x7 - Y .req x9 - U .req x10 - V .req x11 - N .req w12 - - /* Load constants to d0, d1, d2, d3 */ - get_symbol_loc x13, Ljsimd_colorid_ycc_neon_consts - - ld1 {v0.8h, v1.8h}, [x13] - - ldr OUTPUT_BUF0, [OUTPUT_BUF] - ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] - ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] - .unreq OUTPUT_BUF - - /* Save NEON registers */ - sub sp, sp, #64 - mov x9, sp - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 - - /* Outer loop over scanlines */ - cmp NUM_ROWS, #1 - b.lt 9f -0: - ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] - ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] - mov N, OUTPUT_WIDTH - ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] - add OUTPUT_ROW, OUTPUT_ROW, #1 - ldr RGB, [INPUT_BUF], #8 - - /* Inner loop over pixels */ - subs N, N, #8 - b.lt 3f - do_load \bpp, 8, \fast_ld3 - do_rgb_to_yuv_stage1 - subs N, N, #8 - b.lt 2f -1: - do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 - subs N, N, #8 - b.ge 1b -2: - do_rgb_to_yuv_stage2 - do_store 8 - tst N, #7 - b.eq 8f -3: - tbz N, #2, 3f - do_load \bpp, 4, \fast_ld3 -3: - tbz N, #1, 4f - do_load \bpp, 2, \fast_ld3 -4: - tbz N, #0, 5f - do_load \bpp, 1, \fast_ld3 -5: - do_rgb_to_yuv - tbz N, #2, 6f - do_store 4 -6: - tbz N, #1, 7f - do_store 2 -7: - tbz N, #0, 8f - do_store 1 -8: - subs NUM_ROWS, NUM_ROWS, #1 - b.gt 0b -9: - /* Restore all registers and return */ - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - br x30 - - .unreq OUTPUT_WIDTH - .unreq OUTPUT_ROW - .unreq INPUT_BUF - .unreq NUM_ROWS - .unreq OUTPUT_BUF0 - .unreq OUTPUT_BUF1 - .unreq OUTPUT_BUF2 - .unreq RGB - .unreq Y - .unreq U - .unreq V - .unreq N - -.purgem do_rgb_to_yuv -.purgem do_rgb_to_yuv_stage1 -.purgem do_rgb_to_yuv_stage2 -.purgem do_rgb_to_yuv_stage2_store_load_stage1 - -.endm - -/*--------------------------------- id ----- bpp R G B Fast LD3 */ -generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 -generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 -generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 -generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 -generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 -generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 - -generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 -generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 
2, 1, 0, 0 - -.purgem do_load -.purgem do_store - - -/*****************************************************************************/ - -/* - * Load data into workspace, applying unsigned->signed conversion - * - * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get - * rid of VST1.16 instructions - */ - -asm_function jsimd_convsamp_neon - SAMPLE_DATA .req x0 - START_COL .req x1 - WORKSPACE .req x2 - TMP1 .req x9 - TMP2 .req x10 - TMP3 .req x11 - TMP4 .req x12 - TMP5 .req x13 - TMP6 .req x14 - TMP7 .req x15 - TMP8 .req x4 - TMPDUP .req w3 - - /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't - guarantee that the upper (unused) 32 bits of x1 are valid. This - instruction ensures that those bits are set to zero. */ - uxtw x1, w1 - - mov TMPDUP, #128 - ldp TMP1, TMP2, [SAMPLE_DATA], 16 - ldp TMP3, TMP4, [SAMPLE_DATA], 16 - dup v0.8b, TMPDUP - add TMP1, TMP1, START_COL - add TMP2, TMP2, START_COL - ldp TMP5, TMP6, [SAMPLE_DATA], 16 - add TMP3, TMP3, START_COL - add TMP4, TMP4, START_COL - ldp TMP7, TMP8, [SAMPLE_DATA], 16 - add TMP5, TMP5, START_COL - add TMP6, TMP6, START_COL - ld1 {v16.8b}, [TMP1] - add TMP7, TMP7, START_COL - add TMP8, TMP8, START_COL - ld1 {v17.8b}, [TMP2] - usubl v16.8h, v16.8b, v0.8b - ld1 {v18.8b}, [TMP3] - usubl v17.8h, v17.8b, v0.8b - ld1 {v19.8b}, [TMP4] - usubl v18.8h, v18.8b, v0.8b - ld1 {v20.8b}, [TMP5] - usubl v19.8h, v19.8b, v0.8b - ld1 {v21.8b}, [TMP6] - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64 - usubl v20.8h, v20.8b, v0.8b - ld1 {v22.8b}, [TMP7] - usubl v21.8h, v21.8b, v0.8b - ld1 {v23.8b}, [TMP8] - usubl v22.8h, v22.8b, v0.8b - usubl v23.8h, v23.8b, v0.8b - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64 - - br x30 - - .unreq SAMPLE_DATA - .unreq START_COL - .unreq WORKSPACE - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMP4 - .unreq TMP5 - .unreq TMP6 - .unreq TMP7 - .unreq TMP8 - .unreq TMPDUP - -/*****************************************************************************/ - -/* - * jsimd_fdct_islow_neon - * - * This function contains a slow-but-accurate integer implementation of the - * forward DCT (Discrete Cosine Transform). The following code is based - * directly on the IJG's original jfdctint.c; see the jfdctint.c for - * more details. 
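For readers cross-checking the vector code against its C ancestor, here is a minimal model of the fixed-point conventions this FDCT uses; CONST_BITS/PASS1_BITS match the definitions that follow, and the macro shapes follow jfdctint.c/jdct.h, but treat this as an illustrative sketch rather than upstream code:

#include <stdint.h>

#define CONST_BITS  13
#define PASS1_BITS  2

/* Scale a real constant to CONST_BITS fractional bits, e.g. FIX(0.541196100). */
#define FIX(x)          ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

/* Fixed-point multiply: the product keeps CONST_BITS fractional bits. */
#define MULTIPLY(v, c)  ((int32_t)(v) * (c))

/* Round and drop n fractional bits; this is what the rshrn/srshr
   instructions do in the NEON code (right shift, round to nearest). */
#define DESCALE(x, n)   (((x) + ((int32_t)1 << ((n) - 1))) >> (n))
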
- * - * TODO: can be combined with 'jsimd_convsamp_neon' to get - * rid of a bunch of VLD1.16 instructions - */ - -#define CONST_BITS 13 -#define PASS1_BITS 2 - -#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -#define DESCALE_P2 (CONST_BITS + PASS1_BITS) - -#define XFIX_P_0_298 v0.h[0] -#define XFIX_N_0_390 v0.h[1] -#define XFIX_P_0_541 v0.h[2] -#define XFIX_P_0_765 v0.h[3] -#define XFIX_N_0_899 v0.h[4] -#define XFIX_P_1_175 v0.h[5] -#define XFIX_P_1_501 v0.h[6] -#define XFIX_N_1_847 v0.h[7] -#define XFIX_N_1_961 v1.h[0] -#define XFIX_P_2_053 v1.h[1] -#define XFIX_N_2_562 v1.h[2] -#define XFIX_P_3_072 v1.h[3] - -asm_function jsimd_fdct_islow_neon - - DATA .req x0 - TMP .req x9 - - /* Load constants */ - get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts - ld1 {v0.8h, v1.8h}, [TMP] - - /* Save NEON registers */ - sub sp, sp, #64 - mov x10, sp - st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 - st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 - - /* Load all DATA into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 | v16.8h - * 1 | d18 | d19 | v17.8h - * 2 | d20 | d21 | v18.8h - * 3 | d22 | d23 | v19.8h - * 4 | d24 | d25 | v20.8h - * 5 | d26 | d27 | v21.8h - * 6 | d28 | d29 | v22.8h - * 7 | d30 | d31 | v23.8h - */ - - ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 - ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] - sub DATA, DATA, #64 - - /* Transpose */ - transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 - /* 1-D FDCT */ - add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ - sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ - add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ - sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ - add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ - sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ - add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ - sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ - - /* even part */ - - add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ - sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ - add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ - sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ - - add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ - sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ - - add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ - - shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ - shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ - - smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ - smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ - mov v22.16b, v18.16b - mov v25.16b, v24.16b - - smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ - smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ - smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ - smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ - - rshrn v18.4h, v18.4s, #DESCALE_P1 - rshrn v22.4h, v22.4s, #DESCALE_P1 - rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ - rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ - - /* 
Odd part */ - - add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ - add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ - add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ - add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ - smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ - smull2 v5.4s, v10.8h, XFIX_P_1_175 - smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ - smlal2 v5.4s, v11.8h, XFIX_P_1_175 - - smull2 v24.4s, v28.8h, XFIX_P_0_298 - smull2 v25.4s, v29.8h, XFIX_P_2_053 - smull2 v26.4s, v30.8h, XFIX_P_3_072 - smull2 v27.4s, v31.8h, XFIX_P_1_501 - smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ - smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ - smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ - smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ - - smull2 v12.4s, v8.8h, XFIX_N_0_899 - smull2 v13.4s, v9.8h, XFIX_N_2_562 - smull2 v14.4s, v10.8h, XFIX_N_1_961 - smull2 v15.4s, v11.8h, XFIX_N_0_390 - smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ - smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ - smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ - smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ - - add v10.4s, v10.4s, v4.4s /* z3 += z5 */ - add v14.4s, v14.4s, v5.4s - add v11.4s, v11.4s, v4.4s /* z4 += z5 */ - add v15.4s, v15.4s, v5.4s - - add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ - add v24.4s, v24.4s, v12.4s - add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ - add v25.4s, v25.4s, v13.4s - add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ - add v26.4s, v26.4s, v14.4s - add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ - add v27.4s, v27.4s, v15.4s - - add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ - add v24.4s, v24.4s, v14.4s - add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ - add v25.4s, v25.4s, v15.4s - add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ - add v26.4s, v26.4s, v13.4s - add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ - add v27.4s, v27.4s, v12.4s - - rshrn v23.4h, v28.4s, #DESCALE_P1 - rshrn v21.4h, v29.4s, #DESCALE_P1 - rshrn v19.4h, v30.4s, #DESCALE_P1 - rshrn v17.4h, v31.4s, #DESCALE_P1 - rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ - rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ - rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ - rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ - - /* Transpose */ - transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 - - /* 1-D FDCT */ - add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ - sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ - add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ - sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ - add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ - sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ - add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ - sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ - - /* even part */ - add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ - sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ - add v10.8h, v25.8h, v26.8h 
/* tmp11 = tmp1 + tmp2; */ - sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ - - add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ - sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ - - add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ - - srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */ - srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */ - - smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ - smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ - mov v22.16b, v18.16b - mov v25.16b, v24.16b - - smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ - smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ - smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ - smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ - - rshrn v18.4h, v18.4s, #DESCALE_P2 - rshrn v22.4h, v22.4s, #DESCALE_P2 - rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ - rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ - - /* Odd part */ - add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ - add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ - add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ - add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ - - smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ - smull2 v5.4s, v10.8h, XFIX_P_1_175 - smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ - smlal2 v5.4s, v11.8h, XFIX_P_1_175 - - smull2 v24.4s, v28.8h, XFIX_P_0_298 - smull2 v25.4s, v29.8h, XFIX_P_2_053 - smull2 v26.4s, v30.8h, XFIX_P_3_072 - smull2 v27.4s, v31.8h, XFIX_P_1_501 - smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ - smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ - smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ - smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ - - smull2 v12.4s, v8.8h, XFIX_N_0_899 - smull2 v13.4s, v9.8h, XFIX_N_2_562 - smull2 v14.4s, v10.8h, XFIX_N_1_961 - smull2 v15.4s, v11.8h, XFIX_N_0_390 - smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ - smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ - smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ - smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ - - add v10.4s, v10.4s, v4.4s - add v14.4s, v14.4s, v5.4s - add v11.4s, v11.4s, v4.4s - add v15.4s, v15.4s, v5.4s - - add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ - add v24.4s, v24.4s, v12.4s - add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ - add v25.4s, v25.4s, v13.4s - add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ - add v26.4s, v26.4s, v14.4s - add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ - add v27.4s, v27.4s, v15.4s - - add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ - add v24.4s, v24.4s, v14.4s - add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ - add v25.4s, v25.4s, v15.4s - add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ - add v26.4s, v26.4s, v13.4s - add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ - add v27.4s, v27.4s, v12.4s - - rshrn v23.4h, v28.4s, #DESCALE_P2 - rshrn v21.4h, v29.4s, #DESCALE_P2 - rshrn v19.4h, v30.4s, 
#DESCALE_P2 - rshrn v17.4h, v31.4s, #DESCALE_P2 - rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ - rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ - rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ - rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ - - /* store results */ - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] - - /* Restore NEON registers */ - ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 - ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 - - br x30 - - .unreq DATA - .unreq TMP - -#undef XFIX_P_0_298 -#undef XFIX_N_0_390 -#undef XFIX_P_0_541 -#undef XFIX_P_0_765 -#undef XFIX_N_0_899 -#undef XFIX_P_1_175 -#undef XFIX_P_1_501 -#undef XFIX_N_1_847 -#undef XFIX_N_1_961 -#undef XFIX_P_2_053 -#undef XFIX_N_2_562 -#undef XFIX_P_3_072 - - -/*****************************************************************************/ - -/* - * jsimd_fdct_ifast_neon - * - * This function contains a fast, not so accurate integer implementation of - * the forward DCT (Discrete Cosine Transform). It uses the same calculations - * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' - * function from jfdctfst.c - * - * TODO: can be combined with 'jsimd_convsamp_neon' to get - * rid of a bunch of VLD1.16 instructions - */ - -#undef XFIX_0_541196100 -#define XFIX_0_382683433 v0.h[0] -#define XFIX_0_541196100 v0.h[1] -#define XFIX_0_707106781 v0.h[2] -#define XFIX_1_306562965 v0.h[3] - -asm_function jsimd_fdct_ifast_neon - - DATA .req x0 - TMP .req x9 - - /* Load constants */ - get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts - ld1 {v0.4h}, [TMP] - - /* Load all DATA into NEON registers with the following allocation: - * 0 1 2 3 | 4 5 6 7 - * ---------+-------- - * 0 | d16 | d17 | v0.8h - * 1 | d18 | d19 | q9 - * 2 | d20 | d21 | q10 - * 3 | d22 | d23 | q11 - * 4 | d24 | d25 | q12 - * 5 | d26 | d27 | q13 - * 6 | d28 | d29 | q14 - * 7 | d30 | d31 | q15 - */ - - ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 - ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] - mov TMP, #2 - sub DATA, DATA, #64 -1: - /* Transpose */ - transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4 - subs TMP, TMP, #1 - /* 1-D FDCT */ - add v4.8h, v19.8h, v20.8h - sub v20.8h, v19.8h, v20.8h - sub v28.8h, v18.8h, v21.8h - add v18.8h, v18.8h, v21.8h - sub v29.8h, v17.8h, v22.8h - add v17.8h, v17.8h, v22.8h - sub v21.8h, v16.8h, v23.8h - add v16.8h, v16.8h, v23.8h - sub v6.8h, v17.8h, v18.8h - sub v7.8h, v16.8h, v4.8h - add v5.8h, v17.8h, v18.8h - add v6.8h, v6.8h, v7.8h - add v4.8h, v16.8h, v4.8h - sqdmulh v6.8h, v6.8h, XFIX_0_707106781 - add v19.8h, v20.8h, v28.8h - add v16.8h, v4.8h, v5.8h - sub v20.8h, v4.8h, v5.8h - add v5.8h, v28.8h, v29.8h - add v29.8h, v29.8h, v21.8h - sqdmulh v5.8h, v5.8h, XFIX_0_707106781 - sub v28.8h, v19.8h, v29.8h - add v18.8h, v7.8h, v6.8h - sqdmulh v28.8h, v28.8h, XFIX_0_382683433 - sub v22.8h, v7.8h, v6.8h - sqdmulh v19.8h, v19.8h, XFIX_0_541196100 - sqdmulh v7.8h, v29.8h, XFIX_1_306562965 - add v6.8h, v21.8h, v5.8h - sub v5.8h, v21.8h, v5.8h - add v29.8h, v29.8h, v28.8h - add v19.8h, v19.8h, v28.8h - add v29.8h, v29.8h, v7.8h - add v21.8h, v5.8h, v19.8h - sub v19.8h, v5.8h, v19.8h - add v17.8h, v6.8h, v29.8h - sub v23.8h, v6.8h, v29.8h - - b.ne 1b - - /* store results */ - st1 {v16.8h, 
v17.8h, v18.8h, v19.8h}, [DATA], 64 - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] - - br x30 - - .unreq DATA - .unreq TMP -#undef XFIX_0_382683433 -#undef XFIX_0_541196100 -#undef XFIX_0_707106781 -#undef XFIX_1_306562965 - - -/*****************************************************************************/ - -/* - * GLOBAL(void) - * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, - * DCTELEM *workspace); - * - */ -asm_function jsimd_quantize_neon - - COEF_BLOCK .req x0 - DIVISORS .req x1 - WORKSPACE .req x2 - - RECIPROCAL .req DIVISORS - CORRECTION .req x9 - SHIFT .req x10 - LOOP_COUNT .req x11 - - mov LOOP_COUNT, #2 - add CORRECTION, DIVISORS, #(64 * 2) - add SHIFT, DIVISORS, #(64 * 6) -1: - subs LOOP_COUNT, LOOP_COUNT, #1 - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 - ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 - abs v20.8h, v0.8h - abs v21.8h, v1.8h - abs v22.8h, v2.8h - abs v23.8h, v3.8h - ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 - add v20.8h, v20.8h, v4.8h /* add correction */ - add v21.8h, v21.8h, v5.8h - add v22.8h, v22.8h, v6.8h - add v23.8h, v23.8h, v7.8h - umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ - umull2 v16.4s, v20.8h, v28.8h - umull v5.4s, v21.4h, v29.4h - umull2 v17.4s, v21.8h, v29.8h - umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ - umull2 v18.4s, v22.8h, v30.8h - umull v7.4s, v23.4h, v31.4h - umull2 v19.4s, v23.8h, v31.8h - ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 - shrn v4.4h, v4.4s, #16 - shrn v5.4h, v5.4s, #16 - shrn v6.4h, v6.4s, #16 - shrn v7.4h, v7.4s, #16 - shrn2 v4.8h, v16.4s, #16 - shrn2 v5.8h, v17.4s, #16 - shrn2 v6.8h, v18.4s, #16 - shrn2 v7.8h, v19.4s, #16 - neg v24.8h, v24.8h - neg v25.8h, v25.8h - neg v26.8h, v26.8h - neg v27.8h, v27.8h - sshr v0.8h, v0.8h, #15 /* extract sign */ - sshr v1.8h, v1.8h, #15 - sshr v2.8h, v2.8h, #15 - sshr v3.8h, v3.8h, #15 - ushl v4.8h, v4.8h, v24.8h /* shift */ - ushl v5.8h, v5.8h, v25.8h - ushl v6.8h, v6.8h, v26.8h - ushl v7.8h, v7.8h, v27.8h - - eor v4.16b, v4.16b, v0.16b /* restore sign */ - eor v5.16b, v5.16b, v1.16b - eor v6.16b, v6.16b, v2.16b - eor v7.16b, v7.16b, v3.16b - sub v4.8h, v4.8h, v0.8h - sub v5.8h, v5.8h, v1.8h - sub v6.8h, v6.8h, v2.8h - sub v7.8h, v7.8h, v3.8h - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 - - b.ne 1b - - br x30 /* return */ - - .unreq COEF_BLOCK - .unreq DIVISORS - .unreq WORKSPACE - .unreq RECIPROCAL - .unreq CORRECTION - .unreq SHIFT - .unreq LOOP_COUNT - - -/*****************************************************************************/ - -/* - * Downsample pixel values of a single component. - * This version handles the common case of 2:1 horizontal and 1:1 vertical, - * without smoothing. 
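Both downsamplers in this block follow the arithmetic of jcsample.c: average neighboring samples with an alternating rounding bias so rounding error does not drift in one direction. A scalar sketch of what they compute (function names here are illustrative, not upstream API):

#include <stddef.h>
#include <stdint.h>

/* 2:1 horizontal, 1:1 vertical: bias alternates 0,1 across output columns. */
static void h2v1_downsample_row(const uint8_t *in, uint8_t *out, size_t out_cols)
{
    int bias = 0;
    for (size_t i = 0; i < out_cols; i++) {
        out[i] = (uint8_t)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
        bias ^= 1;                      /* 0,1,0,1,... */
    }
}

/* 2:1 horizontal, 2:1 vertical: four samples per output, bias alternates 1,2. */
static void h2v2_downsample_row(const uint8_t *in0, const uint8_t *in1,
                                uint8_t *out, size_t out_cols)
{
    int bias = 1;
    for (size_t i = 0; i < out_cols; i++) {
        out[i] = (uint8_t)((in0[2 * i] + in0[2 * i + 1] +
                            in1[2 * i] + in1[2 * i + 1] + bias) >> 2);
        bias ^= 3;                      /* 1,2,1,2,... */
    }
}

The NEON versions compute the same sums 16 bytes at a time with uadalp, with the bias pattern preloaded into v16.
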
- * - * GLOBAL(void) - * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, - * JDIMENSION v_samp_factor, - * JDIMENSION width_in_blocks, - * JSAMPARRAY input_data, JSAMPARRAY output_data); - */ - -asm_function jsimd_h2v1_downsample_neon - IMAGE_WIDTH .req x0 - MAX_V_SAMP .req x1 - V_SAMP .req x2 - BLOCK_WIDTH .req x3 - INPUT_DATA .req x4 - OUTPUT_DATA .req x5 - OUTPTR .req x9 - INPTR .req x10 - TMP1 .req x11 - TMP2 .req x12 - TMP3 .req x13 - TMPDUP .req w15 - - mov TMPDUP, #0x10000 - lsl TMP2, BLOCK_WIDTH, #4 - sub TMP2, TMP2, IMAGE_WIDTH - get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts - add TMP3, TMP3, TMP2, lsl #4 - dup v16.4s, TMPDUP - ld1 {v18.16b}, [TMP3] - -1: /* row loop */ - ldr INPTR, [INPUT_DATA], #8 - ldr OUTPTR, [OUTPUT_DATA], #8 - subs TMP1, BLOCK_WIDTH, #1 - b.eq 3f -2: /* columns */ - ld1 {v0.16b}, [INPTR], #16 - mov v4.16b, v16.16b - subs TMP1, TMP1, #1 - uadalp v4.8h, v0.16b - shrn v6.8b, v4.8h, #1 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 2b -3: /* last columns */ - ld1 {v0.16b}, [INPTR] - mov v4.16b, v16.16b - subs V_SAMP, V_SAMP, #1 - /* expand right */ - tbl v2.16b, {v0.16b}, v18.16b - uadalp v4.8h, v2.16b - shrn v6.8b, v4.8h, #1 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 1b - - br x30 - - .unreq IMAGE_WIDTH - .unreq MAX_V_SAMP - .unreq V_SAMP - .unreq BLOCK_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA - .unreq OUTPTR - .unreq INPTR - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMPDUP - - -/*****************************************************************************/ - -/* - * Downsample pixel values of a single component. - * This version handles the common case of 2:1 horizontal and 2:1 vertical, - * without smoothing. - * - * GLOBAL(void) - * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, - * JDIMENSION v_samp_factor, - * JDIMENSION width_in_blocks, - * JSAMPARRAY input_data, JSAMPARRAY output_data); - */ - -.balign 16 -asm_function jsimd_h2v2_downsample_neon - IMAGE_WIDTH .req x0 - MAX_V_SAMP .req x1 - V_SAMP .req x2 - BLOCK_WIDTH .req x3 - INPUT_DATA .req x4 - OUTPUT_DATA .req x5 - OUTPTR .req x9 - INPTR0 .req x10 - INPTR1 .req x14 - TMP1 .req x11 - TMP2 .req x12 - TMP3 .req x13 - TMPDUP .req w15 - - mov TMPDUP, #1 - lsl TMP2, BLOCK_WIDTH, #4 - lsl TMPDUP, TMPDUP, #17 - sub TMP2, TMP2, IMAGE_WIDTH - get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts - orr TMPDUP, TMPDUP, #1 - add TMP3, TMP3, TMP2, lsl #4 - dup v16.4s, TMPDUP - ld1 {v18.16b}, [TMP3] - -1: /* row loop */ - ldr INPTR0, [INPUT_DATA], #8 - ldr OUTPTR, [OUTPUT_DATA], #8 - ldr INPTR1, [INPUT_DATA], #8 - subs TMP1, BLOCK_WIDTH, #1 - b.eq 3f -2: /* columns */ - ld1 {v0.16b}, [INPTR0], #16 - ld1 {v1.16b}, [INPTR1], #16 - mov v4.16b, v16.16b - subs TMP1, TMP1, #1 - uadalp v4.8h, v0.16b - uadalp v4.8h, v1.16b - shrn v6.8b, v4.8h, #2 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 2b -3: /* last columns */ - ld1 {v0.16b}, [INPTR0], #16 - ld1 {v1.16b}, [INPTR1], #16 - mov v4.16b, v16.16b - subs V_SAMP, V_SAMP, #1 - /* expand right */ - tbl v2.16b, {v0.16b}, v18.16b - tbl v3.16b, {v1.16b}, v18.16b - uadalp v4.8h, v2.16b - uadalp v4.8h, v3.16b - shrn v6.8b, v4.8h, #2 - st1 {v6.8b}, [OUTPTR], #8 - b.ne 1b - - br x30 - - .unreq IMAGE_WIDTH - .unreq MAX_V_SAMP - .unreq V_SAMP - .unreq BLOCK_WIDTH - .unreq INPUT_DATA - .unreq OUTPUT_DATA - .unreq OUTPTR - .unreq INPTR0 - .unreq INPTR1 - .unreq TMP1 - .unreq TMP2 - .unreq TMP3 - .unreq TMPDUP - - -/*****************************************************************************/ - -/* - * GLOBAL(JOCTET *) - * 
jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, - * JCOEFPTR block, int last_dc_val, - * c_derived_tbl *dctbl, c_derived_tbl *actbl) - * - */ - - BUFFER .req x1 - PUT_BUFFER .req x6 - PUT_BITS .req x7 - PUT_BITSw .req w7 - -.macro emit_byte - sub PUT_BITS, PUT_BITS, #0x8 - lsr x19, PUT_BUFFER, PUT_BITS - uxtb w19, w19 - strb w19, [BUFFER, #1]! - cmp w19, #0xff - b.ne 14f - strb wzr, [BUFFER, #1]! -14: -.endm -.macro put_bits CODE, SIZE - lsl PUT_BUFFER, PUT_BUFFER, \SIZE - add PUT_BITS, PUT_BITS, \SIZE - orr PUT_BUFFER, PUT_BUFFER, \CODE -.endm -.macro checkbuf31 - cmp PUT_BITS, #0x20 - b.lt 31f - emit_byte - emit_byte - emit_byte - emit_byte -31: -.endm -.macro checkbuf47 - cmp PUT_BITS, #0x30 - b.lt 47f - emit_byte - emit_byte - emit_byte - emit_byte - emit_byte - emit_byte -47: -.endm - -.macro generate_jsimd_huff_encode_one_block fast_tbl - -.balign 16 - -.if \fast_tbl == 1 -asm_function jsimd_huff_encode_one_block_neon -.else -asm_function jsimd_huff_encode_one_block_neon_slowtbl -.endif - sub sp, sp, 272 - sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ - /* Save ARM registers */ - stp x19, x20, [sp] -.if \fast_tbl == 1 - get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts -.else - get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts -.endif - ldr PUT_BUFFER, [x0, #0x10] - ldr PUT_BITSw, [x0, #0x18] - ldrsh w12, [x2] /* load DC coeff in w12 */ - /* prepare data */ -.if \fast_tbl == 1 - ld1 {v23.16b}, [x15], #16 - ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 - ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 - ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 - ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 - ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 - sub w12, w12, w3 /* last_dc_val, not used afterwards */ - /* ZigZag 8x8 */ - tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b - tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b - tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b - tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b - tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b - tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b - tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b - tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b - ins v0.h[0], w12 - tbx v1.16b, {v28.16b}, v16.16b - tbx v2.16b, {v29.16b, v30.16b}, v17.16b - tbx v5.16b, {v29.16b, v30.16b}, v18.16b - tbx v6.16b, {v31.16b}, v19.16b -.else - add x13, x2, #0x22 - sub w12, w12, w3 /* last_dc_val, not used afterwards */ - ld1 {v23.16b}, [x15] - add x14, x2, #0x18 - add x3, x2, #0x36 - ins v0.h[0], w12 - add x9, x2, #0x2 - ld1 {v1.h}[0], [x13] - add x15, x2, #0x30 - ld1 {v2.h}[0], [x14] - add x19, x2, #0x26 - ld1 {v3.h}[0], [x3] - add x20, x2, #0x28 - ld1 {v0.h}[1], [x9] - add x12, x2, #0x10 - ld1 {v1.h}[1], [x15] - add x13, x2, #0x40 - ld1 {v2.h}[1], [x19] - add x14, x2, #0x34 - ld1 {v3.h}[1], [x20] - add x3, x2, #0x1a - ld1 {v0.h}[2], [x12] - add x9, x2, #0x20 - ld1 {v1.h}[2], [x13] - add x15, x2, #0x32 - ld1 {v2.h}[2], [x14] - add x19, x2, #0x42 - ld1 {v3.h}[2], [x3] - add x20, x2, #0xc - ld1 {v0.h}[3], [x9] - add x12, x2, #0x12 - ld1 {v1.h}[3], [x15] - add x13, x2, #0x24 - ld1 {v2.h}[3], [x19] - add x14, x2, #0x50 - ld1 {v3.h}[3], [x20] - add x3, x2, #0xe - ld1 {v0.h}[4], [x12] - add x9, x2, #0x4 - ld1 {v1.h}[4], [x13] - add x15, x2, #0x16 - ld1 {v2.h}[4], [x14] - add x19, x2, #0x60 - ld1 {v3.h}[4], [x3] - add x20, x2, #0x1c - ld1 {v0.h}[5], [x9] - add x12, x2, #0x6 - ld1 {v1.h}[5], [x15] - add x13, x2, #0x8 
- ld1 {v2.h}[5], [x19] - add x14, x2, #0x52 - ld1 {v3.h}[5], [x20] - add x3, x2, #0x2a - ld1 {v0.h}[6], [x12] - add x9, x2, #0x14 - ld1 {v1.h}[6], [x13] - add x15, x2, #0xa - ld1 {v2.h}[6], [x14] - add x19, x2, #0x44 - ld1 {v3.h}[6], [x3] - add x20, x2, #0x38 - ld1 {v0.h}[7], [x9] - add x12, x2, #0x46 - ld1 {v1.h}[7], [x15] - add x13, x2, #0x3a - ld1 {v2.h}[7], [x19] - add x14, x2, #0x74 - ld1 {v3.h}[7], [x20] - add x3, x2, #0x6a - ld1 {v4.h}[0], [x12] - add x9, x2, #0x54 - ld1 {v5.h}[0], [x13] - add x15, x2, #0x2c - ld1 {v6.h}[0], [x14] - add x19, x2, #0x76 - ld1 {v7.h}[0], [x3] - add x20, x2, #0x78 - ld1 {v4.h}[1], [x9] - add x12, x2, #0x62 - ld1 {v5.h}[1], [x15] - add x13, x2, #0x1e - ld1 {v6.h}[1], [x19] - add x14, x2, #0x68 - ld1 {v7.h}[1], [x20] - add x3, x2, #0x7a - ld1 {v4.h}[2], [x12] - add x9, x2, #0x70 - ld1 {v5.h}[2], [x13] - add x15, x2, #0x2e - ld1 {v6.h}[2], [x14] - add x19, x2, #0x5a - ld1 {v7.h}[2], [x3] - add x20, x2, #0x6c - ld1 {v4.h}[3], [x9] - add x12, x2, #0x72 - ld1 {v5.h}[3], [x15] - add x13, x2, #0x3c - ld1 {v6.h}[3], [x19] - add x14, x2, #0x4c - ld1 {v7.h}[3], [x20] - add x3, x2, #0x5e - ld1 {v4.h}[4], [x12] - add x9, x2, #0x64 - ld1 {v5.h}[4], [x13] - add x15, x2, #0x4a - ld1 {v6.h}[4], [x14] - add x19, x2, #0x3e - ld1 {v7.h}[4], [x3] - add x20, x2, #0x6e - ld1 {v4.h}[5], [x9] - add x12, x2, #0x56 - ld1 {v5.h}[5], [x15] - add x13, x2, #0x58 - ld1 {v6.h}[5], [x19] - add x14, x2, #0x4e - ld1 {v7.h}[5], [x20] - add x3, x2, #0x7c - ld1 {v4.h}[6], [x12] - add x9, x2, #0x48 - ld1 {v5.h}[6], [x13] - add x15, x2, #0x66 - ld1 {v6.h}[6], [x14] - add x19, x2, #0x5c - ld1 {v7.h}[6], [x3] - add x20, x2, #0x7e - ld1 {v4.h}[7], [x9] - ld1 {v5.h}[7], [x15] - ld1 {v6.h}[7], [x19] - ld1 {v7.h}[7], [x20] -.endif - cmlt v24.8h, v0.8h, #0 - cmlt v25.8h, v1.8h, #0 - cmlt v26.8h, v2.8h, #0 - cmlt v27.8h, v3.8h, #0 - cmlt v28.8h, v4.8h, #0 - cmlt v29.8h, v5.8h, #0 - cmlt v30.8h, v6.8h, #0 - cmlt v31.8h, v7.8h, #0 - abs v0.8h, v0.8h - abs v1.8h, v1.8h - abs v2.8h, v2.8h - abs v3.8h, v3.8h - abs v4.8h, v4.8h - abs v5.8h, v5.8h - abs v6.8h, v6.8h - abs v7.8h, v7.8h - eor v24.16b, v24.16b, v0.16b - eor v25.16b, v25.16b, v1.16b - eor v26.16b, v26.16b, v2.16b - eor v27.16b, v27.16b, v3.16b - eor v28.16b, v28.16b, v4.16b - eor v29.16b, v29.16b, v5.16b - eor v30.16b, v30.16b, v6.16b - eor v31.16b, v31.16b, v7.16b - cmeq v16.8h, v0.8h, #0 - cmeq v17.8h, v1.8h, #0 - cmeq v18.8h, v2.8h, #0 - cmeq v19.8h, v3.8h, #0 - cmeq v20.8h, v4.8h, #0 - cmeq v21.8h, v5.8h, #0 - cmeq v22.8h, v6.8h, #0 - xtn v16.8b, v16.8h - xtn v18.8b, v18.8h - xtn v20.8b, v20.8h - xtn v22.8b, v22.8h - umov w14, v0.h[0] - xtn2 v16.16b, v17.8h - umov w13, v24.h[0] - xtn2 v18.16b, v19.8h - clz w14, w14 - xtn2 v20.16b, v21.8h - lsl w13, w13, w14 - cmeq v17.8h, v7.8h, #0 - sub w12, w14, #32 - xtn2 v22.16b, v17.8h - lsr w13, w13, w14 - and v16.16b, v16.16b, v23.16b - neg w12, w12 - and v18.16b, v18.16b, v23.16b - add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ - and v20.16b, v20.16b, v23.16b - add x15, sp, #0x90 /* x15 = t2 */ - and v22.16b, v22.16b, v23.16b - ldr w10, [x4, x12, lsl #2] - addp v16.16b, v16.16b, v18.16b - ldrb w11, [x3, x12] - addp v20.16b, v20.16b, v22.16b - checkbuf47 - addp v16.16b, v16.16b, v20.16b - put_bits x10, x11 - addp v16.16b, v16.16b, v18.16b - checkbuf47 - umov x9, v16.D[0] - put_bits x13, x12 - cnt v17.8b, v16.8b - mvn x9, x9 - addv B18, v17.8b - add x4, x5, #0x400 /* x4 = actbl->ehufsi */ - umov w12, v18.b[0] - lsr x9, x9, #0x1 /* clear AC coeff */ - ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ - 
rbit x9, x9 /* x9 = index0 */ - ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ - cmp w12, #(64-8) - add x11, sp, #16 - b.lt 4f - cbz x9, 6f - st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 -1: - clz x2, x9 - add x15, x15, x2, lsl #1 - lsl x9, x9, x2 - ldrh w20, [x15, #-126] -2: - cmp x2, #0x10 - b.lt 3f - sub x2, x2, #0x10 - checkbuf47 - put_bits x13, x14 - b 2b -3: - clz w20, w20 - ldrh w3, [x15, #2]! - sub w11, w20, #32 - lsl w3, w3, w20 - neg w11, w11 - lsr w3, w3, w20 - add x2, x11, x2, lsl #4 - lsl x9, x9, #0x1 - ldr w12, [x5, x2, lsl #2] - ldrb w10, [x4, x2] - checkbuf31 - put_bits x12, x10 - put_bits x3, x11 - cbnz x9, 1b - b 6f -4: - movi v21.8h, #0x0010 - clz v0.8h, v0.8h - clz v1.8h, v1.8h - clz v2.8h, v2.8h - clz v3.8h, v3.8h - clz v4.8h, v4.8h - clz v5.8h, v5.8h - clz v6.8h, v6.8h - clz v7.8h, v7.8h - ushl v24.8h, v24.8h, v0.8h - ushl v25.8h, v25.8h, v1.8h - ushl v26.8h, v26.8h, v2.8h - ushl v27.8h, v27.8h, v3.8h - ushl v28.8h, v28.8h, v4.8h - ushl v29.8h, v29.8h, v5.8h - ushl v30.8h, v30.8h, v6.8h - ushl v31.8h, v31.8h, v7.8h - neg v0.8h, v0.8h - neg v1.8h, v1.8h - neg v2.8h, v2.8h - neg v3.8h, v3.8h - neg v4.8h, v4.8h - neg v5.8h, v5.8h - neg v6.8h, v6.8h - neg v7.8h, v7.8h - ushl v24.8h, v24.8h, v0.8h - ushl v25.8h, v25.8h, v1.8h - ushl v26.8h, v26.8h, v2.8h - ushl v27.8h, v27.8h, v3.8h - ushl v28.8h, v28.8h, v4.8h - ushl v29.8h, v29.8h, v5.8h - ushl v30.8h, v30.8h, v6.8h - ushl v31.8h, v31.8h, v7.8h - add v0.8h, v21.8h, v0.8h - add v1.8h, v21.8h, v1.8h - add v2.8h, v21.8h, v2.8h - add v3.8h, v21.8h, v3.8h - add v4.8h, v21.8h, v4.8h - add v5.8h, v21.8h, v5.8h - add v6.8h, v21.8h, v6.8h - add v7.8h, v21.8h, v7.8h - st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 -1: - clz x2, x9 - add x15, x15, x2, lsl #1 - lsl x9, x9, x2 - ldrh w11, [x15, #-126] -2: - cmp x2, #0x10 - b.lt 3f - sub x2, x2, #0x10 - checkbuf47 - put_bits x13, x14 - b 2b -3: - ldrh w3, [x15, #2]! - add x2, x11, x2, lsl #4 - lsl x9, x9, #0x1 - ldr w12, [x5, x2, lsl #2] - ldrb w10, [x4, x2] - checkbuf31 - put_bits x12, x10 - put_bits x3, x11 - cbnz x9, 1b -6: - add x13, sp, #0x10e - cmp x15, x13 - b.hs 1f - ldr w12, [x5] - ldrb w14, [x4] - checkbuf47 - put_bits x12, x14 -1: - str PUT_BUFFER, [x0, #0x10] - str PUT_BITSw, [x0, #0x18] - ldp x19, x20, [sp], 16 - add x0, BUFFER, #0x1 - add sp, sp, 256 - br x30 - -.endm - -generate_jsimd_huff_encode_one_block 1 -generate_jsimd_huff_encode_one_block 0 - - .unreq BUFFER - .unreq PUT_BUFFER - .unreq PUT_BITS - .unreq PUT_BITSw - -.purgem emit_byte -.purgem put_bits -.purgem checkbuf31 -.purgem checkbuf47 diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm index 7a8d784..c46d684 100644 --- a/simd/i386/jccolext-avx2.asm +++ b/simd/i386/jccolext-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
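Stepping back to the AArch64 Huffman encoder above: its emit_byte macro performs JPEG byte stuffing while draining the bit buffer, since any 0xFF byte written to the entropy stream must be followed by 0x00 so it cannot be mistaken for a marker. A scalar model (names illustrative):

#include <stdint.h>

static uint8_t *flush_one_byte(uint8_t *buffer, uint64_t put_buffer, int *put_bits)
{
    *put_bits -= 8;                          /* consume the top 8 buffered bits */
    uint8_t b = (uint8_t)(put_buffer >> *put_bits);
    *buffer++ = b;
    if (b == 0xFF)
        *buffer++ = 0x00;                    /* stuff a zero after 0xFF */
    return buffer;
}

checkbuf31 and checkbuf47 simply drain four or six such bytes once 32 or 48 bits have accumulated.
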
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -110,12 +108,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2): test cl, SIZEOF_BYTE jz short .column_ld2 sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + movzx eax, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] + movzx edx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm index 9a2c30e..6357a42 100644 --- a/simd/i386/jccolext-mmx.asm +++ b/simd/i386/jccolext-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -111,13 +109,13 @@ EXTN(jsimd_rgb_ycc_convert_mmx): jz short .column_ld2 sub ecx, byte SIZEOF_BYTE xor eax, eax - mov al, BYTE [esi+ecx] + mov al, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD xor edx, edx - mov dx, WORD [esi+ecx] + mov dx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: @@ -127,7 +125,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx): test cl, SIZEOF_DWORD jz short .column_ld8 sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] + movd mmG, dword [esi+ecx] psllq mmA, DWORD_BIT por mmA, mmG .column_ld8: @@ -197,7 +195,7 @@ EXTN(jsimd_rgb_ycc_convert_mmx): test cl, SIZEOF_MMWORD/8 jz short .column_ld2 sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] .column_ld2: test cl, SIZEOF_MMWORD/4 jz short .column_ld4 diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm index e830562..c6c8085 100644 --- a/simd/i386/jccolext-sse2.asm +++ b/simd/i386/jccolext-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -109,12 +107,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2): test cl, SIZEOF_BYTE jz short .column_ld2 sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + movzx eax, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] + movzx edx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: diff --git a/simd/i386/jccolor-avx2.asm b/simd/i386/jccolor-avx2.asm index 958517f..14944e9 100644 --- a/simd/i386/jccolor-avx2.asm +++ b/simd/i386/jccolor-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jccolor-mmx.asm b/simd/i386/jccolor-mmx.asm index 47be9e1..8cb399b 100644 --- a/simd/i386/jccolor-mmx.asm +++ b/simd/i386/jccolor-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jccolor-sse2.asm b/simd/i386/jccolor-sse2.asm index c0d5d45..686d222 100644 --- a/simd/i386/jccolor-sse2.asm +++ b/simd/i386/jccolor-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). 
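The .column_ld* fragments retouched in these hunks handle pixel counts that are not a multiple of the vector width: leftover bytes are gathered into a scalar register smallest-chunk-first and only then merged into the SIMD register. A little-endian C model of the pattern (names illustrative):

#include <stdint.h>
#include <string.h>

static uint32_t load_rgb_tail(const uint8_t *src, unsigned count /* 0..3 */)
{
    uint32_t acc = 0;
    if (count & 1) {                    /* trailing single byte */
        count -= 1;
        acc = src[count];
    }
    if (count & 2) {                    /* 16-bit chunk just before it */
        uint16_t w;
        count -= 2;
        memcpy(&w, src + count, sizeof(w));
        acc = (acc << 16) | w;          /* matches shl eax, WORD_BIT / or eax, edx */
    }
    return acc;                         /* later inserted into the vector */
}
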
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcgray-avx2.asm b/simd/i386/jcgray-avx2.asm index 4d66242..560ee0c 100644 --- a/simd/i386/jcgray-avx2.asm +++ b/simd/i386/jcgray-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcgray-mmx.asm b/simd/i386/jcgray-mmx.asm index 07c7ea6..79fdf08 100644 --- a/simd/i386/jcgray-mmx.asm +++ b/simd/i386/jcgray-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcgray-sse2.asm b/simd/i386/jcgray-sse2.asm index 4b8c797..cb4b28e 100644 --- a/simd/i386/jcgray-sse2.asm +++ b/simd/i386/jcgray-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm index 52e99a8..3fa7973 100644 --- a/simd/i386/jcgryext-avx2.asm +++ b/simd/i386/jcgryext-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -102,12 +100,12 @@ EXTN(jsimd_rgb_gray_convert_avx2): test cl, SIZEOF_BYTE jz short .column_ld2 sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + movzx eax, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] + movzx edx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm index 4a9ab0d..8af42e5 100644 --- a/simd/i386/jcgryext-mmx.asm +++ b/simd/i386/jcgryext-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -103,13 +101,13 @@ EXTN(jsimd_rgb_gray_convert_mmx): jz short .column_ld2 sub ecx, byte SIZEOF_BYTE xor eax, eax - mov al, BYTE [esi+ecx] + mov al, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD xor edx, edx - mov dx, WORD [esi+ecx] + mov dx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: @@ -119,7 +117,7 @@ EXTN(jsimd_rgb_gray_convert_mmx): test cl, SIZEOF_DWORD jz short .column_ld8 sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] + movd mmG, dword [esi+ecx] psllq mmA, DWORD_BIT por mmA, mmG .column_ld8: @@ -189,7 +187,7 @@ EXTN(jsimd_rgb_gray_convert_mmx): test cl, SIZEOF_MMWORD/8 jz short .column_ld2 sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] + movd mmA, dword [esi+ecx*RGB_PIXELSIZE] .column_ld2: test cl, SIZEOF_MMWORD/4 jz short .column_ld4 diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm index 04d891c..c9d6ff1 100644 --- a/simd/i386/jcgryext-sse2.asm +++ b/simd/i386/jcgryext-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -101,12 +99,12 @@ EXTN(jsimd_rgb_gray_convert_sse2): test cl, SIZEOF_BYTE jz short .column_ld2 sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + movzx eax, byte [esi+ecx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] + movzx edx, word [esi+ecx] shl eax, WORD_BIT or eax, edx .column_ld4: diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm index 6ea69f6..d0112e6 100644 --- a/simd/i386/jchuff-sse2.asm +++ b/simd/i386/jchuff-sse2.asm @@ -17,8 +17,6 @@ ; This file contains an SSE2 implementation for Huffman coding of one block. ; The following code is based directly on jchuff.c; see jchuff.c for more ; details. -; -; [TAB8] %include "jsimdext.inc" @@ -27,11 +25,10 @@ alignz 32 GLOBAL_DATA(jconst_huff_encode_one_block) + EXTERN EXTN(jpeg_nbits_table) EXTN(jconst_huff_encode_one_block): -%include "jpeg_nbits_table.inc" - alignz 32 ; -------------------------------------------------------------------------- @@ -197,8 +194,8 @@ EXTN(jsimd_huff_encode_one_block_sse2): push ebp mov esi, POINTER [eax+8] ; (working_state *state) - mov put_buffer, DWORD [esi+8] ; put_buffer = state->cur.put_buffer; - mov put_bits, DWORD [esi+12] ; put_bits = state->cur.put_bits; + mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer; + mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits; push esi ; esi is now scratch get_GOT edx ; get GOT address @@ -214,7 +211,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; Encode the DC coefficient difference per section F.1.2.1 mov esi, POINTER [esp+block] ; block movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val; - sub ecx, DWORD [eax+20] + sub ecx, dword [eax+20] mov esi, ecx ; This is a well-known technique for obtaining the absolute value @@ -229,12 +226,12 @@ EXTN(jsimd_huff_encode_one_block_sse2): ; For a negative input, want temp2 = bitwise complement of abs(input) ; This code assumes we are on a two's complement machine add esi, edx ; temp2 += temp3; - mov DWORD [esp+temp], esi ; backup temp2 in temp + mov dword [esp+temp], esi ; backup temp2 in temp ; Find the number of bits needed for the magnitude of the coefficient movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp) - movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)] ; nbits = JPEG_NBITS(temp); - mov DWORD [esp+temp2], edx ; backup nbits in temp2 + movzx edx, byte [GOTOFF(ebp, EXTN(jpeg_nbits_table) + ecx)] ; nbits = JPEG_NBITS(temp); + mov dword [esp+temp2], edx ; backup nbits in temp2 ; Emit the Huffman-coded symbol for the number of bits mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore @@ -242,13 +239,13 @@ EXTN(jsimd_huff_encode_one_block_sse2): movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits]; EMIT_BITS eax ; EMIT_BITS(code, size) - mov ecx, DWORD [esp+temp2] ; restore nbits + mov ecx, dword [esp+temp2] ; restore nbits ; Mask off any extra bits in code mov eax, 1 shl eax, cl dec eax - and eax, DWORD [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1; + and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1; ; Emit that number of bits of the value, if positive, ; or the complement of its magnitude, if negative. 
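The comments in the jchuff-sse2.asm hunk above restate the scalar algorithm from jchuff.c almost line for line; gathered into C, the DC path reads as below. The bit sink is reduced to a shift-and-or accumulator and byte draining is omitted, so this is a sketch of the logic, not a drop-in replacement:

#include <stdint.h>

typedef struct {
    unsigned int ehufco[256];   /* Huffman codes, as in c_derived_tbl */
    char ehufsi[256];           /* and their lengths */
} derived_tbl;

static void encode_dc(int coef, int last_dc_val, const derived_tbl *dctbl,
                      uint64_t *put_buffer, int *put_bits)
{
    int temp = coef - last_dc_val, temp2 = temp;
    if (temp < 0) {
        temp = -temp;           /* temp = abs(input) */
        temp2--;                /* temp2 = one's complement of the magnitude */
    }
    int nbits = 0;              /* nbits = JPEG_NBITS(temp) */
    while (temp) { nbits++; temp >>= 1; }

    /* Huffman symbol for the bit count... */
    *put_buffer = (*put_buffer << dctbl->ehufsi[nbits]) | dctbl->ehufco[nbits];
    *put_bits += dctbl->ehufsi[nbits];
    /* ...then the magnitude bits; mask off any extra bits in code. */
    temp2 &= (1 << nbits) - 1;
    *put_buffer = (*put_buffer << nbits) | (unsigned int)temp2;
    *put_bits += nbits;
    /* flushing full bytes (with 0xFF stuffing) is omitted here */
}
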
@@ -291,22 +288,22 @@ EXTN(jsimd_huff_encode_one_block_sse2): jz near .ELOOP lea esi, [esi+ecx*2] ; k += r; shr edx, cl ; index >>= r; - mov DWORD [esp+temp3], edx + mov dword [esp+temp3], edx .BRLOOP: cmp ecx, 16 ; while (r > 15) { jl near .ERLOOP sub ecx, 16 ; r -= 16; - mov DWORD [esp+temp], ecx + mov dword [esp+temp], ecx mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, DWORD [esp+temp] + mov ecx, dword [esp+temp] jmp .BRLOOP .ERLOOP: movsx eax, word [esi] ; temp = t1[k]; movpic edx, POINTER [esp+gotptr] ; load GOT address (edx) - movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)] ; nbits = JPEG_NBITS(temp); - mov DWORD [esp+temp2], eax + movzx eax, byte [GOTOFF(edx, EXTN(jpeg_nbits_table) + eax)] ; nbits = JPEG_NBITS(temp); + mov dword [esp+temp2], eax ; Emit Huffman symbol for run length / number of bits shl ecx, 4 ; temp3 = (r << 4) + nbits; add ecx, eax @@ -316,13 +313,13 @@ EXTN(jsimd_huff_encode_one_block_sse2): movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; ; Mask off any extra bits in code - mov ecx, DWORD [esp+temp2] + mov ecx, dword [esp+temp2] mov eax, 1 shl eax, cl dec eax and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1; EMIT_BITS eax ; PUT_BITS(temp2, nbits) - mov edx, DWORD [esp+temp3] + mov edx, dword [esp+temp3] add esi, 2 ; ++k; shr edx, 1 ; index >>= 1; @@ -352,29 +349,29 @@ EXTN(jsimd_huff_encode_one_block_sse2): shr edx, cl ; index >>= r; add ecx, eax lea esi, [esi+ecx*2] ; k += r; - mov DWORD [esp+temp3], edx + mov dword [esp+temp3], edx jmp .BRLOOP2 .BLOOP2: bsf ecx, edx ; r = __builtin_ctzl(index); jz near .ELOOP2 lea esi, [esi+ecx*2] ; k += r; shr edx, cl ; index >>= r; - mov DWORD [esp+temp3], edx + mov dword [esp+temp3], edx .BRLOOP2: cmp ecx, 16 ; while (r > 15) { jl near .ERLOOP2 sub ecx, 16 ; r -= 16; - mov DWORD [esp+temp], ecx + mov dword [esp+temp], ecx mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0]; movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0]; EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0) - mov ecx, DWORD [esp+temp] + mov ecx, dword [esp+temp] jmp .BRLOOP2 .ERLOOP2: movsx eax, word [esi] ; temp = t1[k]; bsr eax, eax ; nbits = 32 - __builtin_clz(temp); inc eax - mov DWORD [esp+temp2], eax + mov dword [esp+temp2], eax ; Emit Huffman symbol for run length / number of bits shl ecx, 4 ; temp3 = (r << 4) + nbits; add ecx, eax @@ -384,13 +381,13 @@ EXTN(jsimd_huff_encode_one_block_sse2): movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k]; ; Mask off any extra bits in code - mov ecx, DWORD [esp+temp2] + mov ecx, dword [esp+temp2] mov eax, 1 shl eax, cl dec eax and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1; EMIT_BITS eax ; PUT_BITS(temp2, nbits) - mov edx, DWORD [esp+temp3] + mov edx, dword [esp+temp3] add esi, 2 ; ++k; shr edx, 1 ; index >>= 1; @@ -407,8 +404,8 @@ EXTN(jsimd_huff_encode_one_block_sse2): mov eax, [esp+buffer] pop esi ; Save put_buffer & put_bits - mov DWORD [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; - mov DWORD [esi+12], put_bits ; state->cur.put_bits = put_bits; + mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer; + mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits; pop ebp pop edi diff --git a/simd/i386/jcphuff-sse2.asm b/simd/i386/jcphuff-sse2.asm index e35a7d8..8b73178 100644 --- a/simd/i386/jcphuff-sse2.asm +++ b/simd/i386/jcphuff-sse2.asm @@ -15,8 +15,6 @@ ; ; This file contains an SSE2 implementation of 
data preparation for progressive ; Huffman encoding. See jcphuff.c for more details. -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcsample-avx2.asm b/simd/i386/jcsample-avx2.asm index 5bcdefd..0a20802 100644 --- a/simd/i386/jcsample-avx2.asm +++ b/simd/i386/jcsample-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcsample-mmx.asm b/simd/i386/jcsample-mmx.asm index faf4234..2c223ee 100644 --- a/simd/i386/jcsample-mmx.asm +++ b/simd/i386/jcsample-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jcsample-sse2.asm b/simd/i386/jcsample-sse2.asm index b10fa83..4fea60d 100644 --- a/simd/i386/jcsample-sse2.asm +++ b/simd/i386/jcsample-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm index 46de9b9..015be04 100644 --- a/simd/i386/jdcolext-avx2.asm +++ b/simd/i386/jdcolext-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -348,7 +346,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): vmovd eax, xmmA cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi], ax + mov word [edi], ax add edi, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD shr eax, 16 @@ -357,7 +355,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): ; space. test ecx, ecx jz short .nextrow - mov BYTE [edi], al + mov byte [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm index cd2cb3f..5813cfc 100644 --- a/simd/i386/jdcolext-mmx.asm +++ b/simd/i386/jdcolext-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -280,7 +278,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): movd eax, mmA cmp ecx, byte SIZEOF_DWORD jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax + mov dword [edi+0*SIZEOF_DWORD], eax psrlq mmA, DWORD_BIT movd eax, mmA sub ecx, byte SIZEOF_DWORD @@ -288,14 +286,14 @@ EXTN(jsimd_ycc_rgb_convert_mmx): .column_st2: cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax + mov word [edi+0*SIZEOF_WORD], ax shr eax, WORD_BIT sub ecx, byte SIZEOF_WORD add edi, byte SIZEOF_WORD .column_st1: cmp ecx, byte SIZEOF_BYTE jb short .nextrow - mov BYTE [edi+0*SIZEOF_BYTE], al + mov byte [edi+0*SIZEOF_BYTE], al %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -367,7 +365,7 @@ EXTN(jsimd_ycc_rgb_convert_mmx): .column_st4: cmp ecx, byte SIZEOF_MMWORD/8 jb short .nextrow - movd DWORD [edi+0*SIZEOF_DWORD], mmA + movd dword [edi+0*SIZEOF_DWORD], mmA %endif ; RGB_PIXELSIZE ; --------------- diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm index 0fcb006..d5572b3 100644 --- a/simd/i386/jdcolext-sse2.asm +++ b/simd/i386/jdcolext-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -320,7 +318,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movd eax, xmmA cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi], ax + mov word [edi], ax add edi, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD shr eax, 16 @@ -329,7 +327,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): ; space. test ecx, ecx jz short .nextrow - mov BYTE [edi], al + mov byte [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/i386/jdcolor-avx2.asm b/simd/i386/jdcolor-avx2.asm index d2f86e6..e05b60d 100644 --- a/simd/i386/jdcolor-avx2.asm +++ b/simd/i386/jdcolor-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdcolor-mmx.asm b/simd/i386/jdcolor-mmx.asm index 8f5a3b3..fb7e7bc 100644 --- a/simd/i386/jdcolor-mmx.asm +++ b/simd/i386/jdcolor-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdcolor-sse2.asm b/simd/i386/jdcolor-sse2.asm index ae553db..b736255 100644 --- a/simd/i386/jdcolor-sse2.asm +++ b/simd/i386/jdcolor-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdmerge-avx2.asm b/simd/i386/jdmerge-avx2.asm index 1731844..711e679 100644 --- a/simd/i386/jdmerge-avx2.asm +++ b/simd/i386/jdmerge-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). 
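The .column_st* tails in the jdcolext-mmx hunk above are the mirror image of the partial loads: leftover output bytes are flushed from the scalar register largest-chunk-first. A little-endian C model (names illustrative):

#include <stdint.h>
#include <string.h>

static void store_rgb_tail(uint8_t *dst, uint32_t acc, unsigned count /* 0..3 */)
{
    if (count & 2) {                    /* 16-bit chunk first */
        uint16_t w = (uint16_t)acc;
        memcpy(dst, &w, sizeof(w));     /* mov word [edi], ax */
        dst += 2;
        acc >>= 16;                     /* shr eax, WORD_BIT */
    }
    if (count & 1)
        *dst = (uint8_t)acc;            /* mov byte [edi], al */
}
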
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdmerge-mmx.asm b/simd/i386/jdmerge-mmx.asm index 607bf39..6e8311d 100644 --- a/simd/i386/jdmerge-mmx.asm +++ b/simd/i386/jdmerge-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdmerge-sse2.asm b/simd/i386/jdmerge-sse2.asm index ddb1d5e..e32f90a 100644 --- a/simd/i386/jdmerge-sse2.asm +++ b/simd/i386/jdmerge-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm index cde4865..e35f728 100644 --- a/simd/i386/jdmrgext-avx2.asm +++ b/simd/i386/jdmrgext-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -354,7 +352,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): vmovd eax, xmmA cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi], ax + mov word [edi], ax add edi, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD shr eax, 16 @@ -363,7 +361,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): ; space. test ecx, ecx jz short .endcolumn - mov BYTE [edi], al + mov byte [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm index 4b9e35d..eb3e36b 100644 --- a/simd/i386/jdmrgext-mmx.asm +++ b/simd/i386/jdmrgext-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -283,7 +281,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): movd eax, mmA cmp ecx, byte SIZEOF_DWORD jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax + mov dword [edi+0*SIZEOF_DWORD], eax psrlq mmA, DWORD_BIT movd eax, mmA sub ecx, byte SIZEOF_DWORD @@ -291,14 +289,14 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): .column_st2: cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax + mov word [edi+0*SIZEOF_WORD], ax shr eax, WORD_BIT sub ecx, byte SIZEOF_WORD add edi, byte SIZEOF_WORD .column_st1: cmp ecx, byte SIZEOF_BYTE jb short .endcolumn - mov BYTE [edi+0*SIZEOF_BYTE], al + mov byte [edi+0*SIZEOF_BYTE], al %else ; RGB_PIXELSIZE == 4 ; ----------- @@ -373,7 +371,7 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): .column_st4: cmp ecx, byte SIZEOF_MMWORD/8 jb short .endcolumn - movd DWORD [edi+0*SIZEOF_DWORD], mmA + movd dword [edi+0*SIZEOF_DWORD], mmA %endif ; RGB_PIXELSIZE ; --------------- diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm index ac4697e..c113dc4 100644 --- a/simd/i386/jdmrgext-sse2.asm +++ b/simd/i386/jdmrgext-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -325,7 +323,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movd eax, xmmA cmp ecx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [edi], ax + mov word [edi], ax add edi, byte SIZEOF_WORD sub ecx, byte SIZEOF_WORD shr eax, 16 @@ -334,7 +332,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; space. test ecx, ecx jz short .endcolumn - mov BYTE [edi], al + mov byte [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/i386/jdsample-avx2.asm b/simd/i386/jdsample-avx2.asm index 61ce511..a800c35 100644 --- a/simd/i386/jdsample-avx2.asm +++ b/simd/i386/jdsample-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdsample-mmx.asm b/simd/i386/jdsample-mmx.asm index 1f810fa..12c49f0 100644 --- a/simd/i386/jdsample-mmx.asm +++ b/simd/i386/jdsample-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jdsample-sse2.asm b/simd/i386/jdsample-sse2.asm index f0da626..4e28d2f 100644 --- a/simd/i386/jdsample-sse2.asm +++ b/simd/i386/jdsample-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/i386/jfdctflt-3dn.asm b/simd/i386/jfdctflt-3dn.asm index 1d45865..322ab16 100644 --- a/simd/i386/jfdctflt-3dn.asm +++ b/simd/i386/jfdctflt-3dn.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctflt-sse.asm b/simd/i386/jfdctflt-sse.asm index 1faf835..86952c6 100644 --- a/simd/i386/jfdctflt-sse.asm +++ b/simd/i386/jfdctflt-sse.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctfst-mmx.asm b/simd/i386/jfdctfst-mmx.asm index 0271901..80645a5 100644 --- a/simd/i386/jfdctfst-mmx.asm +++ b/simd/i386/jfdctfst-mmx.asm @@ -18,8 +18,6 @@ ; the forward DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c ; for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctfst-sse2.asm b/simd/i386/jfdctfst-sse2.asm index f09dadd..446fa7a 100644 --- a/simd/i386/jfdctfst-sse2.asm +++ b/simd/i386/jfdctfst-sse2.asm @@ -18,8 +18,6 @@ ; the forward DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c ; for more details. 
-; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm index ae258ee..97de230 100644 --- a/simd/i386/jfdctint-avx2.asm +++ b/simd/i386/jfdctint-avx2.asm @@ -18,8 +18,6 @@ ; forward DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jfdctint.c; see the jfdctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm index c6bd959..3ade9d4 100644 --- a/simd/i386/jfdctint-mmx.asm +++ b/simd/i386/jfdctint-mmx.asm @@ -18,8 +18,6 @@ ; forward DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jfdctint.c; see the jfdctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm index d67dcc1..71b684c 100644 --- a/simd/i386/jfdctint-sse2.asm +++ b/simd/i386/jfdctint-sse2.asm @@ -18,8 +18,6 @@ ; forward DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jfdctint.c; see the jfdctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm index 73aa18d..8795191 100644 --- a/simd/i386/jidctflt-3dn.asm +++ b/simd/i386/jidctflt-3dn.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jidctflt.c; see the jidctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -92,23 +90,23 @@ EXTN(jsimd_idct_float_3dnow): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT pushpic ebx ; save GOT address - mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + mov ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] or eax, ebx poppic ebx ; restore GOT address jnz short .columnDCT ; -- AC terms all zero - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] punpcklwd mm0, mm0 psrad mm0, (DWORD_BIT-WORD_BIT) @@ -135,10 +133,10 @@ EXTN(jsimd_idct_float_3dnow): ; -- Even part - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + movd mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] punpcklwd mm0, mm0 punpcklwd mm1, mm1 @@ -182,10 +180,10 @@ EXTN(jsimd_idct_float_3dnow): ; -- Odd part - movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD 
[DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + movd mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] punpcklwd mm2, mm2 punpcklwd mm3, mm3 diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm index 386650f..b27ecfd 100644 --- a/simd/i386/jidctflt-sse.asm +++ b/simd/i386/jidctflt-sse.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jidctflt.c; see the jidctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz near .columnDCT movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm index 9de7139..c646eae 100644 --- a/simd/i386/jidctflt-sse2.asm +++ b/simd/i386/jidctflt-sse2.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jidctflt.c; see the jidctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -102,8 +100,8 @@ EXTN(jsimd_idct_float_sse2): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz near .columnDCT movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm index d3e8a5d..24622d4 100644 --- a/simd/i386/jidctfst-mmx.asm +++ b/simd/i386/jidctfst-mmx.asm @@ -18,8 +18,6 @@ ; the inverse DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jidctfst.c; see the jidctfst.c ; for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -123,8 +121,8 @@ EXTN(jsimd_idct_ifast_mmx): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm index 83bc414..19704ff 100644 --- a/simd/i386/jidctfst-sse2.asm +++ b/simd/i386/jidctfst-sse2.asm @@ -18,8 +18,6 @@ ; the inverse DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jidctfst.c; see the jidctfst.c ; for more details. 
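Every NO_ZERO_COLUMN_TEST hunk above implements the same screen: before running a full IDCT column pass, OR together the column's AC coefficients (read as dwords, two 16-bit coefficients at a time) and branch to a cheap path when they are all zero, which is very common in quantized blocks. In scalar form:

#include <stdint.h>

static int column_ac_is_zero(const int16_t *coef, int col)
{
  int16_t acc = 0;
  for (int row = 1; row < 8; row++)   /* rows 1..7 hold the AC terms */
    acc |= coef[row * 8 + col];
  return acc == 0;
}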
-; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -118,8 +116,8 @@ EXTN(jsimd_idct_ifast_sse2): mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm index b3b7b14..c371985 100644 --- a/simd/i386/jidctint-avx2.asm +++ b/simd/i386/jidctint-avx2.asm @@ -18,8 +18,6 @@ ; inverse DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jidctint.c; see the jidctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -320,8 +318,8 @@ EXTN(jsimd_idct_islow_avx2): mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm index 6ca6d06..4f07f56 100644 --- a/simd/i386/jidctint-mmx.asm +++ b/simd/i386/jidctint-mmx.asm @@ -18,8 +18,6 @@ ; inverse DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jidctint.c; see the jidctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -136,8 +134,8 @@ EXTN(jsimd_idct_islow_mmx): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm index a6bd00a..e442fdd 100644 --- a/simd/i386/jidctint-sse2.asm +++ b/simd/i386/jidctint-sse2.asm @@ -18,8 +18,6 @@ ; inverse DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jidctint.c; see the jidctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -131,8 +129,8 @@ EXTN(jsimd_idct_islow_sse2): mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm index 336ee3b..e2307e1 100644 --- a/simd/i386/jidctred-mmx.asm +++ b/simd/i386/jidctred-mmx.asm @@ -18,8 +18,6 @@ ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. ; The following code is based directly on the IJG's original jidctred.c; ; see the jidctred.c for more details. 
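When the test fires, the transform degenerates to a constant: with only F(0,0) nonzero, sample(x, y) = (1/4)·C(0)·C(0)·F(0,0) = F(0,0)/8 for every (x, y), so the asm dequantizes the DC term once and splats it across the outputs. A sketch of the whole-block shortcut, ignoring the extra fixed-point pass scaling the real code carries:

#include <stdint.h>
#include <string.h>

static void dc_only_block(int16_t dc_coef, uint16_t dc_quant, uint8_t *out)
{
  int v = (((int)dc_coef * dc_quant + 4) >> 3) + 128;  /* /8, round, level shift */
  if (v < 0)   v = 0;
  if (v > 255) v = 255;
  memset(out, (uint8_t)v, 64);                         /* fill all 8x8 samples */
}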
-; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -144,8 +142,8 @@ EXTN(jsimd_idct_4x4_mmx): alignx 16, 7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -464,16 +462,16 @@ EXTN(jsimd_idct_4x4_mmx): mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 psrlq mm1, 4*BYTE_BIT psrlq mm0, 4*BYTE_BIT mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + movd dword [edx+eax*SIZEOF_JSAMPLE], mm1 + movd dword [esi+eax*SIZEOF_JSAMPLE], mm0 emms ; empty MMX state @@ -688,8 +686,8 @@ EXTN(jsimd_idct_2x2_mmx): mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx emms ; empty MMX state diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm index 97838ba..6e56494 100644 --- a/simd/i386/jidctred-sse2.asm +++ b/simd/i386/jidctred-sse2.asm @@ -18,8 +18,6 @@ ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. ; The following code is based directly on the IJG's original jidctred.c; ; see the jidctred.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -139,8 +137,8 @@ EXTN(jsimd_idct_4x4_sse2): mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] jnz short .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] @@ -578,8 +576,8 @@ EXTN(jsimd_idct_2x2_sse2): mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + mov word [edx+eax*SIZEOF_JSAMPLE], bx + mov word [esi+eax*SIZEOF_JSAMPLE], cx pop edi pop esi diff --git a/simd/i386/jquant-3dn.asm b/simd/i386/jquant-3dn.asm index 1767f44..5cb60ca 100644 --- a/simd/i386/jquant-3dn.asm +++ b/simd/i386/jquant-3dn.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jquant-mmx.asm b/simd/i386/jquant-mmx.asm index 98932db..61305c6 100644 --- a/simd/i386/jquant-mmx.asm +++ b/simd/i386/jquant-mmx.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jquant-sse.asm b/simd/i386/jquant-sse.asm index cc244c4..218adc9 100644 --- a/simd/i386/jquant-sse.asm +++ b/simd/i386/jquant-sse.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
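The jidctred stores above write a whole reduced-size output row at once: each 4x4 row leaves as one movd dword store and each 2x2 row as one word store. The aliasing-safe C spelling of that single-store pattern goes through memcpy, which compilers fold back to the plain mov; a sketch:

#include <stdint.h>
#include <string.h>

static void store_row4(uint8_t *dst, uint32_t four_samples)
{
  memcpy(dst, &four_samples, sizeof(four_samples));  /* movd dword [...] */
}

static void store_row2(uint8_t *dst, uint16_t two_samples)
{
  memcpy(dst, &two_samples, sizeof(two_samples));    /* mov word [...] */
}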
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jquantf-sse2.asm b/simd/i386/jquantf-sse2.asm index 8d1201c..a881ab5 100644 --- a/simd/i386/jquantf-sse2.asm +++ b/simd/i386/jquantf-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm index ea8e1a1..5ed6bec 100644 --- a/simd/i386/jquanti-avx2.asm +++ b/simd/i386/jquanti-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jquanti-sse2.asm b/simd/i386/jquanti-sse2.asm index 2a69494..0a50940 100644 --- a/simd/i386/jquanti-sse2.asm +++ b/simd/i386/jquanti-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c index 563949a..2f92f8c 100644 --- a/simd/i386/jsimd.c +++ b/simd/i386/jsimd.c @@ -543,6 +543,12 @@ jsimd_can_h2v1_fancy_upsample(void) return 0; } +GLOBAL(int) +jsimd_can_h1v2_fancy_upsample(void) +{ + return 0; +} + GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) @@ -579,6 +585,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, output_data_ptr); } +GLOBAL(void) +jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + GLOBAL(int) jsimd_can_h2v2_merged_upsample(void) { diff --git a/simd/i386/jsimdcpu.asm b/simd/i386/jsimdcpu.asm index 0af4eec..ddcafa9 100644 --- a/simd/i386/jsimdcpu.asm +++ b/simd/i386/jsimdcpu.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
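The jsimd.c hunk above is the dispatch half of the SIMD layer: each jsimd_*() entry point is paired with a jsimd_can_*() probe, and a probe that returns 0, as h1v2 fancy upsampling does here because no i386 implementation exists, routes the caller to the portable C code while the empty stub keeps the exported symbol set uniform across architectures. Roughly how a caller consumes the pair (a sketch, assuming the usual jpeglib.h declarations are in scope; h1v2_fancy_upsample stands in for the C fallback and is illustrative):

typedef void (*upsample_fn)(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                            JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);

static upsample_fn pick_h1v2_fancy(void)
{
  return jsimd_can_h1v2_fancy_upsample() ? jsimd_h1v2_fancy_upsample  /* SIMD */
                                         : h1v2_fancy_upsample;       /* C    */
}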
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/jsimd.h b/simd/jsimd.h index a9fc812..99c8801 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -121,13 +121,6 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_neon (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows); -EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3 - (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows); -EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3 - (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows); - EXTERN(void) jsimd_rgb_ycc_convert_dspr2 (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows); @@ -263,6 +256,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_avx2 (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_rgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extrgbx_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgr_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extbgrx_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxbgr_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); +EXTERN(void) jsimd_extxrgb_gray_convert_neon + (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows); + EXTERN(void) jsimd_rgb_gray_convert_dspr2 (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows); @@ -401,13 +416,6 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); -EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3 - (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows); -EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3 - (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows); - EXTERN(void) jsimd_ycc_rgb_convert_dspr2 (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows); @@ -562,6 +570,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2 (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v1_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_upsample_neon + (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); + EXTERN(void) jsimd_h2v1_upsample_dspr2 (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); @@ 
-608,6 +623,12 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_avx2 EXTERN(void) jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h2v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); +EXTERN(void) jsimd_h1v2_fancy_upsample_neon + (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, + JSAMPARRAY *output_data_ptr); EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2 (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data, @@ -762,6 +783,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2 (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + +EXTERN(void) jsimd_h2v2_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); +EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon + (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf); + EXTERN(void) jsimd_h2v1_merged_upsample_dspr2 (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE *range); diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c deleted file mode 100644 index 6cdeb5e..0000000 --- a/simd/loongson/jccolext-mmi.c +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2014-2015, 2019, D. R. 
Commander. All Rights Reserved. - * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * ZhangLixia <zhanglixia-hf@loongson.cn> - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* This file is included by jccolor-mmi.c */ - - -#if RGB_RED == 0 -#define mmA mm0 -#define mmB mm1 -#elif RGB_GREEN == 0 -#define mmA mm2 -#define mmB mm3 -#elif RGB_BLUE == 0 -#define mmA mm4 -#define mmB mm5 -#else -#define mmA mm6 -#define mmB mm7 -#endif - -#if RGB_RED == 1 -#define mmC mm0 -#define mmD mm1 -#elif RGB_GREEN == 1 -#define mmC mm2 -#define mmD mm3 -#elif RGB_BLUE == 1 -#define mmC mm4 -#define mmD mm5 -#else -#define mmC mm6 -#define mmD mm7 -#endif - -#if RGB_RED == 2 -#define mmE mm0 -#define mmF mm1 -#elif RGB_GREEN == 2 -#define mmE mm2 -#define mmF mm3 -#elif RGB_BLUE == 2 -#define mmE mm4 -#define mmF mm5 -#else -#define mmE mm6 -#define mmF mm7 -#endif - -#if RGB_RED == 3 -#define mmG mm0 -#define mmH mm1 -#elif RGB_GREEN == 3 -#define mmG mm2 -#define mmH mm3 -#elif RGB_BLUE == 3 -#define mmG mm4 -#define mmH mm5 -#else -#define mmG mm6 -#define mmH mm7 -#endif - - -void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - JSAMPROW inptr, outptr0, outptr1, outptr2; - int num_cols, col; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 wk[7]; - __m64 Y_BG, Cb_RG, Cr_BG; - - while (--num_rows >= 0) { - inptr = *input_buf++; - outptr0 = output_buf[0][output_row]; - outptr1 = output_buf[1][output_row]; - outptr2 = output_buf[2][output_row]; - output_row++; - - for (num_cols = image_width; num_cols > 0; num_cols -= 8, - outptr0 += 8, outptr1 += 8, outptr2 += 8) { - -#if RGB_PIXELSIZE == 3 - - if (num_cols < 8) { - col = num_cols * 3; - asm(".set noreorder\r\n" - - "li $8, 1\r\n" - "move $9, %3\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 1f\r\n" - "nop \r\n" - "subu $9, $9, 1\r\n" - "xor $12, $12, $12\r\n" - "move $13, %5\r\n" - "dadd $13, $13, $9\r\n" - "lbu $12, 0($13)\r\n" - - "1: \r\n" - "li $8, 2\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 2f\r\n" - "nop \r\n" - "subu $9, $9, 2\r\n" - "xor $11, $11, $11\r\n" - "move $13, %5\r\n" - "dadd $13, $13, $9\r\n" - "lhu $11, 0($13)\r\n" - "sll $12, $12, 16\r\n" - "or $12, $12, $11\r\n" - - "2: \r\n" - "dmtc1 $12, %0\r\n" - "li $8, 4\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 3f\r\n" - "nop \r\n" - "subu $9, $9, 4\r\n" - "move $13, %5\r\n" - "dadd $13, $13, $9\r\n" - "lwu 
$14, 0($13)\r\n" - "dmtc1 $14, %1\r\n" - "dsll32 $12, $12, 0\r\n" - "or $12, $12, $14\r\n" - "dmtc1 $12, %0\r\n" - - "3: \r\n" - "li $8, 8\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 4f\r\n" - "nop \r\n" - "mov.s %1, %0\r\n" - "ldc1 %0, 0(%5)\r\n" - "li $9, 8\r\n" - "j 5f\r\n" - "nop \r\n" - - "4: \r\n" - "li $8, 16\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 5f\r\n" - "nop \r\n" - "mov.s %2, %0\r\n" - "ldc1 %0, 0(%5)\r\n" - "ldc1 %1, 8(%5)\r\n" - - "5: \r\n" - "nop \r\n" - ".set reorder\r\n" - - : "=f" (mmA), "=f" (mmG), "=f" (mmF) - : "r" (col), "r" (num_rows), "r" (inptr) - : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "memory" - ); - } else { - if (!(((long)inptr) & 7)) { - mmA = _mm_load_si64((__m64 *)&inptr[0]); - mmG = _mm_load_si64((__m64 *)&inptr[8]); - mmF = _mm_load_si64((__m64 *)&inptr[16]); - } else { - mmA = _mm_loadu_si64((__m64 *)&inptr[0]); - mmG = _mm_loadu_si64((__m64 *)&inptr[8]); - mmF = _mm_loadu_si64((__m64 *)&inptr[16]); - } - inptr += RGB_PIXELSIZE * 8; - } - mmD = mmA; - mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); - mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT); - - mmA = _mm_unpackhi_pi8(mmA, mmG); - mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); - - mmD = _mm_unpacklo_pi8(mmD, mmF); - mmG = _mm_unpackhi_pi8(mmG, mmF); - - mmE = mmA; - mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); - mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT); - - mmA = _mm_unpackhi_pi8(mmA, mmD); - mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); - - mmE = _mm_unpacklo_pi8(mmE, mmG); - mmD = _mm_unpackhi_pi8(mmD, mmG); - mmC = mmA; - mmA = _mm_loadlo_pi8_f(mmA); - mmC = _mm_loadhi_pi8_f(mmC); - - mmB = mmE; - mmE = _mm_loadlo_pi8_f(mmE); - mmB = _mm_loadhi_pi8_f(mmB); - - mmF = mmD; - mmD = _mm_loadlo_pi8_f(mmD); - mmF = _mm_loadhi_pi8_f(mmF); - -#else /* RGB_PIXELSIZE == 4 */ - - if (num_cols < 8) { - col = num_cols; - asm(".set noreorder\r\n" - - "li $8, 1\r\n" - "move $9, %4\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 1f\r\n" - "nop \r\n" - "subu $9, $9, 1\r\n" - "dsll $11, $9, 2\r\n" - "move $13, %5\r\n" - "daddu $13, $13, $11\r\n" - "lwc1 %0, 0($13)\r\n" - - "1: \r\n" - "li $8, 2\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 2f\r\n" - "nop \r\n" - "subu $9, $9, 2\r\n" - "dsll $11, $9, 2\r\n" - "move $13, %5\r\n" - "daddu $13, $13, $11\r\n" - "mov.s %1, %0\r\n" - "ldc1 %0, 0($13)\r\n" - - "2: \r\n" - "li $8, 4\r\n" - "and $10, $9, $8\r\n" - "beqz $10, 3f\r\n" - "nop \r\n" - "mov.s %2, %0\r\n" - "mov.s %3, %1\r\n" - "ldc1 %0, 0(%5)\r\n" - "ldc1 %1, 8(%5)\r\n" - - "3: \r\n" - "nop \r\n" - ".set reorder\r\n" - - : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC) - : "r" (col), "r" (inptr) - : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory" - ); - } else { - if (!(((long)inptr) & 7)) { - mmA = _mm_load_si64((__m64 *)&inptr[0]); - mmF = _mm_load_si64((__m64 *)&inptr[8]); - mmD = _mm_load_si64((__m64 *)&inptr[16]); - mmC = _mm_load_si64((__m64 *)&inptr[24]); - } else { - mmA = _mm_loadu_si64((__m64 *)&inptr[0]); - mmF = _mm_loadu_si64((__m64 *)&inptr[8]); - mmD = _mm_loadu_si64((__m64 *)&inptr[16]); - mmC = _mm_loadu_si64((__m64 *)&inptr[24]); - } - inptr += RGB_PIXELSIZE * 8; - } - mmB = mmA; - mmA = _mm_unpacklo_pi8(mmA, mmF); - mmB = _mm_unpackhi_pi8(mmB, mmF); - - mmG = mmD; - mmD = _mm_unpacklo_pi8(mmD, mmC); - mmG = _mm_unpackhi_pi8(mmG, mmC); - - mmE = mmA; - mmA = _mm_unpacklo_pi16(mmA, mmD); - mmE = _mm_unpackhi_pi16(mmE, mmD); - - mmH = mmB; - mmB = _mm_unpacklo_pi16(mmB, mmG); - mmH = _mm_unpackhi_pi16(mmH, mmG); - - mmC = mmA; - mmA = _mm_loadlo_pi8_f(mmA); - mmC = _mm_loadhi_pi8_f(mmC); - - mmD = 
mmB; - mmB = _mm_loadlo_pi8_f(mmB); - mmD = _mm_loadhi_pi8_f(mmD); - - mmG = mmE; - mmE = _mm_loadlo_pi8_f(mmE); - mmG = _mm_loadhi_pi8_f(mmG); - - mmF = mmH; - mmF = _mm_unpacklo_pi8(mmF, mmH); - mmH = _mm_unpackhi_pi8(mmH, mmH); - mmF = _mm_srli_pi16(mmF, BYTE_BIT); - mmH = _mm_srli_pi16(mmH, BYTE_BIT); - -#endif - - wk[0] = mm0; - wk[1] = mm1; - wk[2] = mm4; - wk[3] = mm5; - - mm6 = mm1; - mm1 = _mm_unpacklo_pi16(mm1, mm3); - mm6 = _mm_unpackhi_pi16(mm6, mm3); - mm7 = mm1; - mm4 = mm6; - mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337); - mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); - mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033); - mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); - - wk[4] = mm1; - wk[5] = mm6; - - mm1 = _mm_loadlo_pi16_f(mm5); - mm6 = _mm_loadhi_pi16_f(mm5); - mm1 = _mm_srli_pi32(mm1, 1); - mm6 = _mm_srli_pi32(mm6, 1); - - mm5 = PD_ONEHALFM1_CJ; - mm7 = _mm_add_pi32(mm7, mm1); - mm4 = _mm_add_pi32(mm4, mm6); - mm7 = _mm_add_pi32(mm7, mm5); - mm4 = _mm_add_pi32(mm4, mm5); - mm7 = _mm_srli_pi32(mm7, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm7 = _mm_packs_pi32(mm7, mm4); - - mm1 = wk[2]; - mm6 = mm0; - mm0 = _mm_unpacklo_pi16(mm0, mm2); - mm6 = _mm_unpackhi_pi16(mm6, mm2); - mm5 = mm0; - mm4 = mm6; - mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337); - mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); - mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033); - mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); - - wk[6] = mm0; - wk[7] = mm6; - mm0 = _mm_loadlo_pi16_f(mm1); - mm6 = _mm_loadhi_pi16_f(mm1); - mm0 = _mm_srli_pi32(mm0, 1); - mm6 = _mm_srli_pi32(mm6, 1); - - mm1 = PD_ONEHALFM1_CJ; - mm5 = _mm_add_pi32(mm5, mm0); - mm4 = _mm_add_pi32(mm4, mm6); - mm5 = _mm_add_pi32(mm5, mm1); - mm4 = _mm_add_pi32(mm4, mm1); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm5 = _mm_packs_pi32(mm5, mm4); - - mm7 = _mm_slli_pi16(mm7, BYTE_BIT); - mm5 = _mm_or_si64(mm5, mm7); - Cb_RG = mm5; - - mm0 = wk[3]; - mm6 = wk[2]; - mm1 = wk[1]; - - mm4 = mm0; - mm0 = _mm_unpacklo_pi16(mm0, mm3); - mm4 = _mm_unpackhi_pi16(mm4, mm3); - mm7 = mm0; - mm5 = mm4; - mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250); - mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); - mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041); - mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); - - mm3 = PD_ONEHALF; - mm0 = _mm_add_pi32(mm0, wk[4]); - mm4 = _mm_add_pi32(mm4, wk[5]); - mm0 = _mm_add_pi32(mm0, mm3); - mm4 = _mm_add_pi32(mm4, mm3); - mm0 = _mm_srli_pi32(mm0, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm0 = _mm_packs_pi32(mm0, mm4); - - mm3 = _mm_loadlo_pi16_f(mm1); - mm4 = _mm_loadhi_pi16_f(mm1); - mm3 = _mm_srli_pi32(mm3, 1); - mm4 = _mm_srli_pi32(mm4, 1); - - mm1 = PD_ONEHALFM1_CJ; - mm7 = _mm_add_pi32(mm7, mm3); - mm5 = _mm_add_pi32(mm5, mm4); - mm7 = _mm_add_pi32(mm7, mm1); - mm5 = _mm_add_pi32(mm5, mm1); - mm7 = _mm_srli_pi32(mm7, SCALEBITS); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm7 = _mm_packs_pi32(mm7, mm5); - - mm3 = wk[0]; - mm4 = mm6; - mm6 = _mm_unpacklo_pi16(mm6, mm2); - mm4 = _mm_unpackhi_pi16(mm4, mm2); - mm1 = mm6; - mm5 = mm4; - mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250); - mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); - mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041); - mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); - - mm2 = PD_ONEHALF; - mm6 = _mm_add_pi32(mm6, wk[6]); - mm4 = _mm_add_pi32(mm4, wk[7]); - mm6 = _mm_add_pi32(mm6, mm2); - mm4 = _mm_add_pi32(mm4, mm2); - mm6 = _mm_srli_pi32(mm6, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm6 = _mm_packs_pi32(mm6, mm4); - - mm0 = _mm_slli_pi16(mm0, BYTE_BIT); - mm6 = _mm_or_si64(mm6, 
mm0); - Y_BG = mm6; - - mm2 = _mm_loadlo_pi16_f(mm3); - mm4 = _mm_loadhi_pi16_f(mm3); - mm2 = _mm_srli_pi32(mm2, 1); - mm4 = _mm_srli_pi32(mm4, 1); - - mm0 = PD_ONEHALFM1_CJ; - mm1 = _mm_add_pi32(mm1, mm2); - mm5 = _mm_add_pi32(mm5, mm4); - mm1 = _mm_add_pi32(mm1, mm0); - mm5 = _mm_add_pi32(mm5, mm0); - mm1 = _mm_srli_pi32(mm1, SCALEBITS); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm1 = _mm_packs_pi32(mm1, mm5); - - mm7 = _mm_slli_pi16(mm7, BYTE_BIT); - mm1 = _mm_or_si64(mm1, mm7); - Cr_BG = mm1; - - _mm_store_si64((__m64 *)&outptr0[0], Y_BG); - _mm_store_si64((__m64 *)&outptr1[0], Cb_RG); - _mm_store_si64((__m64 *)&outptr2[0], Cr_BG); - } - } -} - -#undef mmA -#undef mmB -#undef mmC -#undef mmD -#undef mmE -#undef mmF -#undef mmG -#undef mmH diff --git a/simd/loongson/jccolor-mmi.c b/simd/loongson/jccolor-mmi.c deleted file mode 100644 index 93ef5c7..0000000 --- a/simd/loongson/jccolor-mmi.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2011, 2014, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
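The jsimd_rgb_ycc_convert_mmi body deleted above evaluates RGB-to-YCbCr entirely in 16x16->32 fixed point: pmaddhw-style multiplies against packed constant pairs, a rounding constant, then an arithmetic shift by SCALEBITS (16). A scalar reference for that arithmetic, using the FIX() constants defined below in jccolor-mmi.c; the Cb/Cr rounding term of ONE_HALF - 1 plus the 128 offset is exactly PD_ONEHALFM1_CJ:

#include <stdint.h>

enum { SCALEBITS = 16, ONE_HALF = 1 << 15, CBCR_OFFSET = 128 << 16 };

static void rgb_to_ycc(int r, int g, int b, uint8_t ycc[3])
{
  ycc[0] = (uint8_t)((19595 * r + 38470 * g + 7471 * b
                      + ONE_HALF) >> SCALEBITS);                   /* Y  */
  ycc[1] = (uint8_t)((-11059 * r - 21709 * g + 32768 * b
                      + CBCR_OFFSET + ONE_HALF - 1) >> SCALEBITS); /* Cb */
  ycc[2] = (uint8_t)((32768 * r - 27439 * g - 5329 * b
                      + CBCR_OFFSET + ONE_HALF - 1) >> SCALEBITS); /* Cr */
}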
- */ - -/* RGB --> YCC CONVERSION */ - -#include "jsimd_mmi.h" - - -#define F_0_081 ((short)5329) /* FIX(0.08131) */ -#define F_0_114 ((short)7471) /* FIX(0.11400) */ -#define F_0_168 ((short)11059) /* FIX(0.16874) */ -#define F_0_250 ((short)16384) /* FIX(0.25000) */ -#define F_0_299 ((short)19595) /* FIX(0.29900) */ -#define F_0_331 ((short)21709) /* FIX(0.33126) */ -#define F_0_418 ((short)27439) /* FIX(0.41869) */ -#define F_0_587 ((short)38470) /* FIX(0.58700) */ -#define F_0_337 ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */ - -enum const_index { - index_PD_ONEHALF, - index_PW_F0299_F0337, - index_PW_F0114_F0250, - index_PW_MF016_MF033, - index_PW_MF008_MF041, - index_PD_ONEHALFM1_CJ -}; - -static uint64_t const_value[] = { - _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))), - _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299), - _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114), - _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168), - _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081), - _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)), - ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS))) -}; - -#define get_const_value(index) (*(__m64 *)&const_value[index]) - -#define PD_ONEHALF get_const_value(index_PD_ONEHALF) -#define PW_F0299_F0337 get_const_value(index_PW_F0299_F0337) -#define PW_F0114_F0250 get_const_value(index_PW_F0114_F0250) -#define PW_MF016_MF033 get_const_value(index_PW_MF016_MF033) -#define PW_MF008_MF041 get_const_value(index_PW_MF008_MF041) -#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ) - - -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE - -#define RGB_RED EXT_RGB_RED -#define RGB_GREEN EXT_RGB_GREEN -#define RGB_BLUE EXT_RGB_BLUE -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi - -#define RGB_RED EXT_RGBX_RED -#define RGB_GREEN EXT_RGBX_GREEN -#define RGB_BLUE EXT_RGBX_BLUE -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi - -#define RGB_RED EXT_BGR_RED -#define RGB_GREEN EXT_BGR_GREEN -#define RGB_BLUE EXT_BGR_BLUE -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi - -#define RGB_RED EXT_BGRX_RED -#define RGB_GREEN EXT_BGRX_GREEN -#define RGB_BLUE EXT_BGRX_BLUE -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi - -#define RGB_RED EXT_XBGR_RED -#define RGB_GREEN EXT_XBGR_GREEN -#define RGB_BLUE EXT_XBGR_BLUE -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi - -#define RGB_RED EXT_XRGB_RED -#define RGB_GREEN EXT_XRGB_GREEN -#define RGB_BLUE EXT_XRGB_BLUE -#define 
RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi -#include "jccolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_rgb_ycc_convert_mmi diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c deleted file mode 100644 index 2f2d851..0000000 --- a/simd/loongson/jcsample-mmi.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* CHROMA DOWNSAMPLING */ - -#include "jsimd_mmi.h" -#include "jcsample.h" - - -void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, - JDIMENSION width_in_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - int inrow, outrow, outcol, bias; - JDIMENSION output_cols = width_in_blocks * DCTSIZE; - JSAMPROW inptr0, inptr1, outptr; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7; - - expand_right_edge(input_data, max_v_samp_factor, image_width, - output_cols * 2); - - bias = (1 << 17) + 1; /* 0x00020001 (bias pattern) */ - mm7 = _mm_set1_pi32(bias); /* mm7={1, 2, 1, 2} */ - mm6 = _mm_cmpeq_pi16(mm6, mm6); - mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */ - - for (inrow = 0, outrow = 0; outrow < v_samp_factor; - inrow += 2, outrow++) { - - inptr0 = input_data[inrow]; - inptr1 = input_data[inrow + 1]; - outptr = output_data[outrow]; - - for (outcol = output_cols; outcol > 0; - outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) { - - mm0 = _mm_load_si64((__m64 *)&inptr0[0]); - mm1 = _mm_load_si64((__m64 *)&inptr1[0]); - mm2 = _mm_load_si64((__m64 *)&inptr0[8]); - mm3 = _mm_load_si64((__m64 *)&inptr1[8]); - - mm4 = mm0; - mm5 = mm1; - mm0 = _mm_and_si64(mm0, mm6); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm1 = _mm_and_si64(mm1, mm6); - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); - mm0 = _mm_add_pi16(mm0, mm4); - mm1 = _mm_add_pi16(mm1, mm5); - - mm4 = mm2; - mm5 = mm3; - mm2 = _mm_and_si64(mm2, mm6); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm3 = _mm_and_si64(mm3, mm6); - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); - mm2 = _mm_add_pi16(mm2, mm4); - mm3 = _mm_add_pi16(mm3, mm5); - - mm0 = _mm_add_pi16(mm0, mm1); - mm2 = _mm_add_pi16(mm2, mm3); - mm0 = _mm_add_pi16(mm0, mm7); - mm2 = _mm_add_pi16(mm2, 
mm7); - mm0 = _mm_srli_pi16(mm0, 2); - mm2 = _mm_srli_pi16(mm2, 2); - - mm0 = _mm_packs_pu16(mm0, mm2); - - _mm_store_si64((__m64 *)&outptr[0], mm0); - } - } -} diff --git a/simd/loongson/jcsample.h b/simd/loongson/jcsample.h deleted file mode 100644 index 2ac4816..0000000 --- a/simd/loongson/jcsample.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * jcsample.h - * - * This file was part of the Independent JPEG Group's software: - * Copyright (C) 1991-1996, Thomas G. Lane. - * For conditions of distribution and use, see the accompanying README.ijg - * file. - */ - -LOCAL(void) -expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, - JDIMENSION output_cols) -{ - register JSAMPROW ptr; - register JSAMPLE pixval; - register int count; - int row; - int numcols = (int)(output_cols - input_cols); - - if (numcols > 0) { - for (row = 0; row < num_rows; row++) { - ptr = image_data[row] + input_cols; - pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ - for (count = numcols; count > 0; count--) - *ptr++ = pixval; - } - } -} diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c deleted file mode 100644 index 560d9b0..0000000 --- a/simd/loongson/jdcolext-mmi.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
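The deleted jsimd_h2v2_downsample_mmi above is a 2x2 box average with libjpeg's alternating rounding bias: the 0x00020001 pattern broadcast into mm7 supplies {1, 2, 1, 2}, so neighboring output columns round in opposite directions instead of drifting together. The scalar shape, run after expand_right_edge() has padded each row:

#include <stdint.h>

static void h2v2_downsample_row(const uint8_t *in0, const uint8_t *in1,
                                uint8_t *out, int out_cols)
{
  int bias = 1;                          /* 1, 2, 1, 2, ... */
  for (int i = 0; i < out_cols; i++) {
    out[i] = (uint8_t)((in0[2 * i] + in0[2 * i + 1] +
                        in1[2 * i] + in1[2 * i + 1] + bias) >> 2);
    bias ^= 3;                           /* toggle 1 <-> 2 */
  }
}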
- */ - -/* This file is included by jdcolor-mmi.c */ - - -#if RGB_RED == 0 -#define mmA mm0 -#define mmB mm1 -#elif RGB_GREEN == 0 -#define mmA mm2 -#define mmB mm3 -#elif RGB_BLUE == 0 -#define mmA mm4 -#define mmB mm5 -#else -#define mmA mm6 -#define mmB mm7 -#endif - -#if RGB_RED == 1 -#define mmC mm0 -#define mmD mm1 -#elif RGB_GREEN == 1 -#define mmC mm2 -#define mmD mm3 -#elif RGB_BLUE == 1 -#define mmC mm4 -#define mmD mm5 -#else -#define mmC mm6 -#define mmD mm7 -#endif - -#if RGB_RED == 2 -#define mmE mm0 -#define mmF mm1 -#elif RGB_GREEN == 2 -#define mmE mm2 -#define mmF mm3 -#elif RGB_BLUE == 2 -#define mmE mm4 -#define mmF mm5 -#else -#define mmE mm6 -#define mmF mm7 -#endif - -#if RGB_RED == 3 -#define mmG mm0 -#define mmH mm1 -#elif RGB_GREEN == 3 -#define mmG mm2 -#define mmH mm3 -#elif RGB_BLUE == 3 -#define mmG mm4 -#define mmH mm5 -#else -#define mmG mm6 -#define mmH mm7 -#endif - - -void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ - JSAMPROW outptr, inptr0, inptr1, inptr2; - int num_cols, col; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 mm8, wk[2]; - - while (--num_rows >= 0) { - inptr0 = input_buf[0][input_row]; - inptr1 = input_buf[1][input_row]; - inptr2 = input_buf[2][input_row]; - input_row++; - outptr = *output_buf++; - - for (num_cols = out_width; num_cols > 0; num_cols -= 8, - inptr0 += 8, inptr1 += 8, inptr2 += 8) { - - mm5 = _mm_load_si64((__m64 *)inptr1); - mm1 = _mm_load_si64((__m64 *)inptr2); - mm8 = _mm_load_si64((__m64 *)inptr0); - mm4 = 0; - mm7 = 0; - mm4 = _mm_cmpeq_pi16(mm4, mm4); - mm7 = _mm_cmpeq_pi16(mm7, mm7); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm7 = _mm_slli_pi16(mm7, 7); /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */ - mm0 = mm4; /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */ - - mm4 = _mm_and_si64(mm4, mm5); /* mm4=Cb(0246)=CbE */ - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Cb(1357)=CbO */ - mm0 = _mm_and_si64(mm0, mm1); /* mm0=Cr(0246)=CrE */ - mm1 = _mm_srli_pi16(mm1, BYTE_BIT); /* mm1=Cr(1357)=CrO */ - mm4 = _mm_add_pi16(mm4, mm7); - mm5 = _mm_add_pi16(mm5, mm7); - mm0 = _mm_add_pi16(mm0, mm7); - mm1 = _mm_add_pi16(mm1, mm7); - - /* (Original) - * R = Y + 1.40200 * Cr - * G = Y - 0.34414 * Cb - 0.71414 * Cr - * B = Y + 1.77200 * Cb - * - * (This implementation) - * R = Y + 0.40200 * Cr + Cr - * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - * B = Y - 0.22800 * Cb + Cb + Cb - */ - - mm2 = mm4; /* mm2 = CbE */ - mm3 = mm5; /* mm3 = CbO */ - mm4 = _mm_add_pi16(mm4, mm4); /* mm4 = 2*CbE */ - mm5 = _mm_add_pi16(mm5, mm5); /* mm5 = 2*CbO */ - mm6 = mm0; /* mm6 = CrE */ - mm7 = mm1; /* mm7 = CrO */ - mm0 = _mm_add_pi16(mm0, mm0); /* mm0 = 2*CrE */ - mm1 = _mm_add_pi16(mm1, mm1); /* mm1 = 2*CrO */ - - mm4 = _mm_mulhi_pi16(mm4, PW_MF0228); /* mm4=(2*CbE * -FIX(0.22800) */ - mm5 = _mm_mulhi_pi16(mm5, PW_MF0228); /* mm5=(2*CbO * -FIX(0.22800) */ - mm0 = _mm_mulhi_pi16(mm0, PW_F0402); /* mm0=(2*CrE * FIX(0.40200)) */ - mm1 = _mm_mulhi_pi16(mm1, PW_F0402); /* mm1=(2*CrO * FIX(0.40200)) */ - - mm4 = _mm_add_pi16(mm4, PW_ONE); - mm5 = _mm_add_pi16(mm5, PW_ONE); - mm4 = _mm_srai_pi16(mm4, 1); /* mm4=(CbE * -FIX(0.22800)) */ - mm5 = _mm_srai_pi16(mm5, 1); /* mm5=(CbO * -FIX(0.22800)) */ - mm0 = _mm_add_pi16(mm0, PW_ONE); - mm1 = _mm_add_pi16(mm1, PW_ONE); - mm0 = _mm_srai_pi16(mm0, 1); /* mm0=(CrE * FIX(0.40200)) */ - mm1 = _mm_srai_pi16(mm1, 1); /* mm1=(CrO * FIX(0.40200)) */ - - mm4 = _mm_add_pi16(mm4, mm2); - mm5 = _mm_add_pi16(mm5, mm3); - mm4 = _mm_add_pi16(mm4, 
mm2); /* mm4=(CbE * FIX(1.77200))=(B-Y)E */ - mm5 = _mm_add_pi16(mm5, mm3); /* mm5=(CbO * FIX(1.77200))=(B-Y)O */ - mm0 = _mm_add_pi16(mm0, mm6); /* mm0=(CrE * FIX(1.40200))=(R-Y)E */ - mm1 = _mm_add_pi16(mm1, mm7); /* mm1=(CrO * FIX(1.40200))=(R-Y)O */ - - wk[0] = mm4; /* wk(0)=(B-Y)E */ - wk[1] = mm5; /* wk(1)=(B-Y)O */ - - mm4 = mm2; - mm5 = mm3; - mm2 = _mm_unpacklo_pi16(mm2, mm6); - mm4 = _mm_unpackhi_pi16(mm4, mm6); - mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285); - mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285); - mm3 = _mm_unpacklo_pi16(mm3, mm7); - mm5 = _mm_unpackhi_pi16(mm5, mm7); - mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285); - mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285); - - mm2 = _mm_add_pi32(mm2, PD_ONEHALF); - mm4 = _mm_add_pi32(mm4, PD_ONEHALF); - mm2 = _mm_srai_pi32(mm2, SCALEBITS); - mm4 = _mm_srai_pi32(mm4, SCALEBITS); - mm3 = _mm_add_pi32(mm3, PD_ONEHALF); - mm5 = _mm_add_pi32(mm5, PD_ONEHALF); - mm3 = _mm_srai_pi32(mm3, SCALEBITS); - mm5 = _mm_srai_pi32(mm5, SCALEBITS); - - mm2 = _mm_packs_pi32(mm2, mm4); /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */ - mm3 = _mm_packs_pi32(mm3, mm5); /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */ - mm2 = _mm_sub_pi16(mm2, mm6); /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ - mm3 = _mm_sub_pi16(mm3, mm7); /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ - - mm5 = mm8; /* mm5=Y(01234567) */ - - mm4 = _mm_cmpeq_pi16(mm4, mm4); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); /* mm4={0xFF 0x00 0xFF 0x00 ..} */ - mm4 = _mm_and_si64(mm4, mm5); /* mm4=Y(0246)=YE */ - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Y(1357)=YO */ - - mm0 = _mm_add_pi16(mm0, mm4); /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */ - mm1 = _mm_add_pi16(mm1, mm5); /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */ - mm0 = _mm_packs_pu16(mm0, mm0); /* mm0=(R0 R2 R4 R6 ** ** ** **) */ - mm1 = _mm_packs_pu16(mm1, mm1); /* mm1=(R1 R3 R5 R7 ** ** ** **) */ - - mm2 = _mm_add_pi16(mm2, mm4); /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */ - mm3 = _mm_add_pi16(mm3, mm5); /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */ - mm2 = _mm_packs_pu16(mm2, mm2); /* mm2=(G0 G2 G4 G6 ** ** ** **) */ - mm3 = _mm_packs_pu16(mm3, mm3); /* mm3=(G1 G3 G5 G7 ** ** ** **) */ - - mm4 = _mm_add_pi16(mm4, wk[0]); /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */ - mm5 = _mm_add_pi16(mm5, wk[1]); /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */ - mm4 = _mm_packs_pu16(mm4, mm4); /* mm4=(B0 B2 B4 B6 ** ** ** **) */ - mm5 = _mm_packs_pu16(mm5, mm5); /* mm5=(B1 B3 B5 B7 ** ** ** **) */ - -#if RGB_PIXELSIZE == 3 - - /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ - /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ - mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */ - mmE = _mm_unpacklo_pi8(mmE, mmB); /* mmE=(20 01 22 03 24 05 26 07) */ - mmD = _mm_unpacklo_pi8(mmD, mmF); /* mmD=(11 21 13 23 15 25 17 27) */ - - mmG = mmA; - mmH = mmA; - mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 01 02 12 22 03) */ - mmG = _mm_unpackhi_pi16(mmG, mmE); /* mmG=(04 14 24 05 06 16 26 07) */ - - mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT); - mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); - - mmC = mmD; - mmB = mmD; - mmD = _mm_unpacklo_pi16(mmD, mmH); /* mmD=(11 21 02 12 13 23 04 14) */ - mmC = _mm_unpackhi_pi16(mmC, mmH); /* mmC=(15 25 06 16 17 27 -- --) */ - - mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */ - - mmF = mmE; - mmE = _mm_unpacklo_pi16(mmE, mmB); /* mmE=(22 03 13 23 24 05 15 25) */ - mmF = _mm_unpackhi_pi16(mmF, mmB); /* mmF=(26 07 17 27 -- -- -- --) */ - - mmA = _mm_unpacklo_pi32(mmA, mmD); /* mmA=(00 10 20 01 11 21 02 12) */ 
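The rewritten equations in the comment above exist because FIX(1.40200) = 91881 does not fit a signed 16-bit pmulhw-style multiplier: splitting each coefficient at 1.0 (or 2.0) leaves fractional parts below FIX(1), so 1.40200 becomes an add plus F_0_402 = 26345, and so on. A scalar sketch with the jdcolor-mmi.c constants, rounding elided (the real code doubles the operand, adds PW_ONE, and shifts to round the mulhi result), and assuming arithmetic >> on negative values as the SIMD code does:

static void ycc_to_rgb(int y, int cb, int cr, int rgb[3])
{
  cb -= 128;  cr -= 128;  /* the 0xFF80 additions above re-center Cb/Cr */

  int r_y = cr + ((26345 * cr) >> 16);                /* 1.40200 = 1 + 0.40200  */
  int g_y = ((-22554 * cb + 18734 * cr) >> 16) - cr;  /* -0.71414 = 0.28586 - 1 */
  int b_y = 2 * cb - ((14942 * cb) >> 16);            /* 1.77200 = 2 - 0.22800  */

  rgb[0] = y + r_y;  rgb[1] = y + g_y;  rgb[2] = y + b_y;  /* then clamp 0..255 */
}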
- mmE = _mm_unpacklo_pi32(mmE, mmG); /* mmE=(22 03 13 23 04 14 24 05) */ - mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */ - - if (num_cols >= 8) { - _mm_store_si64((__m64 *)outptr, mmA); - _mm_store_si64((__m64 *)(outptr + 8), mmE); - _mm_store_si64((__m64 *)(outptr + 16), mmC); - outptr += RGB_PIXELSIZE * 8; - } else { - col = num_cols * 3; - asm(".set noreorder\r\n" - - "li $8, 16\r\n" - "move $9, %4\r\n" - "mov.s $f4, %1\r\n" - "mov.s $f6, %3\r\n" - "move $10, %5\r\n" - "bltu $9, $8, 1f\r\n" - "nop \r\n" - "gssdlc1 $f4, 7($10)\r\n" - "gssdrc1 $f4, 0($10)\r\n" - "gssdlc1 $f6, 7+8($10)\r\n" - "gssdrc1 $f6, 8($10)\r\n" - "mov.s $f4, %2\r\n" - "subu $9, $9, 16\r\n" - "daddu $10, $10, 16\r\n" - "b 2f\r\n" - "nop \r\n" - - "1: \r\n" - "li $8, 8\r\n" /* st8 */ - "bltu $9, $8, 2f\r\n" - "nop \r\n" - "gssdlc1 $f4, 7($10)\r\n" - "gssdrc1 $f4, ($10)\r\n" - "mov.s $f4, %3\r\n" - "subu $9, $9, 8\r\n" - "daddu $10, $10, 8\r\n" - - "2: \r\n" - "li $8, 4\r\n" /* st4 */ - "mfc1 $11, $f4\r\n" - "bltu $9, $8, 3f\r\n" - "nop \r\n" - "swl $11, 3($10)\r\n" - "swr $11, 0($10)\r\n" - "li $8, 32\r\n" - "mtc1 $8, $f6\r\n" - "dsrl $f4, $f4, $f6\r\n" - "mfc1 $11, $f4\r\n" - "subu $9, $9, 4\r\n" - "daddu $10, $10, 4\r\n" - - "3: \r\n" - "li $8, 2\r\n" /* st2 */ - "bltu $9, $8, 4f\r\n" - "nop \r\n" - "ush $11, 0($10)\r\n" - "srl $11, 16\r\n" - "subu $9, $9, 2\r\n" - "daddu $10, $10, 2\r\n" - - "4: \r\n" - "li $8, 1\r\n" /* st1 */ - "bltu $9, $8, 5f\r\n" - "nop \r\n" - "sb $11, 0($10)\r\n" - - "5: \r\n" - "nop \r\n" /* end */ - : "=m" (*outptr) - : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr) - : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory" - ); - } - -#else /* RGB_PIXELSIZE == 4 */ - -#ifdef RGBX_FILLER_0XFF - mm6 = _mm_cmpeq_pi8(mm6, mm6); - mm7 = _mm_cmpeq_pi8(mm7, mm7); -#else - mm6 = _mm_xor_si64(mm6, mm6); - mm7 = _mm_xor_si64(mm7, mm7); -#endif - /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ - /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ - /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ - /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ - - mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */ - mmE = _mm_unpacklo_pi8(mmE, mmG); /* mmE=(20 30 22 32 24 34 26 36) */ - mmB = _mm_unpacklo_pi8(mmB, mmD); /* mmB=(01 11 03 13 05 15 07 17) */ - mmF = _mm_unpacklo_pi8(mmF, mmH); /* mmF=(21 31 23 33 25 35 27 37) */ - - mmC = mmA; - mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 30 02 12 22 32) */ - mmC = _mm_unpackhi_pi16(mmC, mmE); /* mmC=(04 14 24 34 06 16 26 36) */ - mmG = mmB; - mmB = _mm_unpacklo_pi16(mmB, mmF); /* mmB=(01 11 21 31 03 13 23 33) */ - mmG = _mm_unpackhi_pi16(mmG, mmF); /* mmG=(05 15 25 35 07 17 27 37) */ - - mmD = mmA; - mmA = _mm_unpacklo_pi32(mmA, mmB); /* mmA=(00 10 20 30 01 11 21 31) */ - mmD = _mm_unpackhi_pi32(mmD, mmB); /* mmD=(02 12 22 32 03 13 23 33) */ - mmH = mmC; - mmC = _mm_unpacklo_pi32(mmC, mmG); /* mmC=(04 14 24 34 05 15 25 35) */ - mmH = _mm_unpackhi_pi32(mmH, mmG); /* mmH=(06 16 26 36 07 17 27 37) */ - - if (num_cols >= 8) { - _mm_store_si64((__m64 *)outptr, mmA); - _mm_store_si64((__m64 *)(outptr + 8), mmD); - _mm_store_si64((__m64 *)(outptr + 16), mmC); - _mm_store_si64((__m64 *)(outptr + 24), mmH); - outptr += RGB_PIXELSIZE * 8; - } else { - col = num_cols; - asm(".set noreorder\r\n" /* st16 */ - - "li $8, 4\r\n" - "move $9, %6\r\n" - "move $10, %7\r\n" - "mov.s $f4, %2\r\n" - "mov.s $f6, %4\r\n" - "bltu $9, $8, 1f\r\n" - "nop \r\n" - 
"gssdlc1 $f4, 7($10)\r\n" - "gssdrc1 $f4, ($10)\r\n" - "gssdlc1 $f6, 7+8($10)\r\n" - "gssdrc1 $f6, 8($10)\r\n" - "mov.s $f4, %3\r\n" - "mov.s $f6, %5\r\n" - "subu $9, $9, 4\r\n" - "daddu $10, $10, 16\r\n" - - "1: \r\n" - "li $8, 2\r\n" /* st8 */ - "bltu $9, $8, 2f\r\n" - "nop \r\n" - "gssdlc1 $f4, 7($10)\r\n" - "gssdrc1 $f4, 0($10)\r\n" - "mov.s $f4, $f6\r\n" - "subu $9, $9, 2\r\n" - "daddu $10, $10, 8\r\n" - - "2: \r\n" - "li $8, 1\r\n" /* st4 */ - "bltu $9, $8, 3f\r\n" - "nop \r\n" - "gsswlc1 $f4, 3($10)\r\n" - "gsswrc1 $f4, 0($10)\r\n" - - "3: \r\n" - "li %1, 0\r\n" /* end */ - : "=m" (*outptr), "=r" (col) - : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col), - "r" (outptr) - : "$f4", "$f6", "$8", "$9", "$10", "memory" - ); - } - -#endif - - } - } -} - -#undef mmA -#undef mmB -#undef mmC -#undef mmD -#undef mmE -#undef mmF -#undef mmG -#undef mmH diff --git a/simd/loongson/jdcolor-mmi.c b/simd/loongson/jdcolor-mmi.c deleted file mode 100644 index 2c58263..0000000 --- a/simd/loongson/jdcolor-mmi.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2011, 2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
- */ - -/* YCC --> RGB CONVERSION */ - -#include "jsimd_mmi.h" - - -#define F_0_344 ((short)22554) /* FIX(0.34414) */ -#define F_0_402 ((short)26345) /* FIX(1.40200) - FIX(1) */ -#define F_0_285 ((short)18734) /* FIX(1) - FIX(0.71414) */ -#define F_0_228 ((short)14942) /* FIX(2) - FIX(1.77200) */ - -enum const_index { - index_PW_ONE, - index_PW_F0402, - index_PW_MF0228, - index_PW_MF0344_F0285, - index_PD_ONEHALF -}; - -static uint64_t const_value[] = { - _uint64_set_pi16(1, 1, 1, 1), - _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402), - _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228), - _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344), - _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))) -}; - -#define PW_ONE get_const_value(index_PW_ONE) -#define PW_F0402 get_const_value(index_PW_F0402) -#define PW_MF0228 get_const_value(index_PW_MF0228) -#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285) -#define PD_ONEHALF get_const_value(index_PD_ONEHALF) - -#define RGBX_FILLER_0XFF 1 - - -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE - -#define RGB_RED EXT_RGB_RED -#define RGB_GREEN EXT_RGB_GREEN -#define RGB_BLUE EXT_RGB_BLUE -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi - -#define RGB_RED EXT_RGBX_RED -#define RGB_GREEN EXT_RGBX_GREEN -#define RGB_BLUE EXT_RGBX_BLUE -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi - -#define RGB_RED EXT_BGR_RED -#define RGB_GREEN EXT_BGR_GREEN -#define RGB_BLUE EXT_BGR_BLUE -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi - -#define RGB_RED EXT_BGRX_RED -#define RGB_GREEN EXT_BGRX_GREEN -#define RGB_BLUE EXT_BGRX_BLUE -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi - -#define RGB_RED EXT_XBGR_RED -#define RGB_GREEN EXT_XBGR_GREEN -#define RGB_BLUE EXT_XBGR_BLUE -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi - -#define RGB_RED EXT_XRGB_RED -#define RGB_GREEN EXT_XRGB_GREEN -#define RGB_BLUE EXT_XRGB_BLUE -#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi -#include "jdcolext-mmi.c" -#undef RGB_RED -#undef RGB_GREEN -#undef RGB_BLUE -#undef RGB_PIXELSIZE -#undef jsimd_ycc_rgb_convert_mmi diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c deleted file mode 100644 index 00a6265..0000000 --- a/simd/loongson/jdsample-mmi.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. 
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- * All Rights Reserved.
- * Authors: ZhuChen <zhuchen@loongson.cn>
- * CaiWanwei <caiwanwei@loongson.cn>
- * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_mmi.h"
-
-
-enum const_index {
- index_PW_THREE,
- index_PW_SEVEN,
- index_PW_EIGHT,
-};
-
-static uint64_t const_value[] = {
- _uint64_set_pi16(3, 3, 3, 3),
- _uint64_set_pi16(7, 7, 7, 7),
- _uint64_set_pi16(8, 8, 8, 8),
-};
-
-#define PW_THREE get_const_value(index_PW_THREE)
-#define PW_SEVEN get_const_value(index_PW_SEVEN)
-#define PW_EIGHT get_const_value(index_PW_EIGHT)
-
-
-#define PROCESS_ROW(r) { \
- mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \
- mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \
- \
- mm0 = mm7; \
- mm4 = mm3; \
- mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \
- mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
- mm5 = mm7; \
- mm6 = mm3; \
- mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
- mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \
- \
- mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \
- mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \
- \
- mm1 = mm7; \
- mm2 = mm3; \
- mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \
- mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \
- mm4 = mm3; \
- mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
- \
- mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \
- mm2 = _mm_or_si64(mm2, wk[r + 2]); /* mm2=( 5 6 7 8) */ \
- \
- wk[r] = mm4; \
- \
- mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
- mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
- mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
- mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
- mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
- mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
- \
- mm1 = _mm_add_pi16(mm1, mm7); \
- mm5 = _mm_add_pi16(mm5, mm3); \
- mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \
- mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \
- mm0 = _mm_add_pi16(mm0, mm7); \
- mm2 = _mm_add_pi16(mm2, mm3); \
- mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \
- mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \
- \
- mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
- mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
- mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \
- mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \
- \
- 
_mm_store_si64((__m64 *)outptr##r, mm1); \ - _mm_store_si64((__m64 *)outptr##r + 1, mm5); \ -} - -void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, - JDIMENSION downsampled_width, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; - int inrow, outrow, incol, tmp, tmp1; - __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0; - __m64 wk[4], mm_tmp; - - for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { - - inptr_1 = input_data[inrow - 1]; - inptr0 = input_data[inrow]; - inptr1 = input_data[inrow + 1]; - outptr0 = output_data[outrow++]; - outptr1 = output_data[outrow++]; - - if (downsampled_width & 7) { - tmp = (downsampled_width - 1) * sizeof(JSAMPLE); - tmp1 = downsampled_width * sizeof(JSAMPLE); - asm("daddu $8, %3, %6\r\n" - "lb $9, ($8)\r\n" - "daddu $8, %3, %7\r\n" - "sb $9, ($8)\r\n" - "daddu $8, %4, %6\r\n" - "lb $9, ($8)\r\n" - "daddu $8, %4, %7\r\n" - "sb $9, ($8)\r\n" - "daddu $8, %5, %6\r\n" - "lb $9, ($8)\r\n" - "daddu $8, %5, %7\r\n" - "sb $9, ($8)\r\n" - : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1) - : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1) - : "$8", "$9" - ); - } - - /* process the first column block */ - mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */ - mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */ - mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */ - - mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */ - mm4 = mm0; - mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */ - mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */ - mm5 = mm1; - mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */ - mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */ - mm6 = mm2; - mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */ - mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */ - - mm0 = _mm_mullo_pi16(mm0, PW_THREE); - mm4 = _mm_mullo_pi16(mm4, PW_THREE); - - mm7 = _mm_cmpeq_pi8(mm7, mm7); - mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT); - - mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */ - mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */ - mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */ - mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */ - - _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */ - _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */ - _mm_store_si64((__m64 *)outptr1, mm2); - _mm_store_si64((__m64 *)outptr1 + 1, mm6); - - mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */ - mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */ - - wk[0] = mm1; - wk[1] = mm2; - - for (incol = downsampled_width; incol > 0; - incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8, - outptr0 += 16, outptr1 += 16) { - - if (incol > 8) { - /* process the next column block */ - mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */ - mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */ - mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */ - - mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */ - mm4 = mm0; - mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */ - mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */ - mm5 = mm1; - mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */ - mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */ - mm6 = mm2; - mm2 = 
_mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */ - mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */ - - mm0 = _mm_mullo_pi16(mm0, PW_THREE); - mm4 = _mm_mullo_pi16(mm4, PW_THREE); - - mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */ - mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */ - mm2 = _mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */ - mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */ - - _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */ - _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */ - _mm_store_si64((__m64 *)outptr1 + 2, mm2); - _mm_store_si64((__m64 *)outptr1 + 3, mm6); - - mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */ - mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */ - - wk[2] = mm1; - wk[3] = mm2; - } else { - /* process the last column block */ - mm1 = _mm_cmpeq_pi8(mm1, mm1); - mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); - mm2 = mm1; - - mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1); - mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */ - mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1); - mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */ - - wk[2] = mm1; - wk[3] = mm2; - } - - /* process the upper row */ - PROCESS_ROW(0) - - /* process the lower row */ - PROCESS_ROW(1) - } - } -} diff --git a/simd/loongson/jfdctint-mmi.c b/simd/loongson/jfdctint-mmi.c deleted file mode 100644 index a0ea692..0000000 --- a/simd/loongson/jfdctint-mmi.c +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2014, 2018, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
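jsimd_h2v2_fancy_upsample_mmi above implements libjpeg's "fancy" (triangle-filter) 2x2 upsampling: each input row is first blended vertically as int = 3 * nearer_row + farther_row, and the horizontal pass then weights each intermediate sample 3:1 against its left or right neighbor and divides by 16, with the rounding biases 8 and 7 held in PW_EIGHT and PW_SEVEN. A scalar sketch of the horizontal pass for one output row (names illustrative; edge samples replicate, which the wk[] edge words arrange in the vector code):

/* Sketch: horizontal pass of h2v2 fancy upsampling for one output row.
 * int_v[x] = 3 * nearer_row[x] + farther_row[x], as computed above. */
static void h2v2_fancy_row(const int *int_v, unsigned char *out, int w)
{
  int x;

  for (x = 0; x < w; x++) {
    int c = int_v[x];
    int l = int_v[x > 0 ? x - 1 : 0];          /* replicate at the edges */
    int r = int_v[x < w - 1 ? x + 1 : w - 1];

    out[2 * x]     = (unsigned char)((3 * c + l + 8) >> 4);  /* PW_EIGHT */
    out[2 * x + 1] = (unsigned char)((3 * c + r + 7) >> 4);  /* PW_SEVEN */
  }
}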
- */ - -/* SLOW INTEGER FORWARD DCT */ - -#include "jsimd_mmi.h" - - -#define CONST_BITS 13 -#define PASS1_BITS 2 -#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -#define DESCALE_P2 (CONST_BITS + PASS1_BITS) - -#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ -#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ -#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ -#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ -#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ -#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ -#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ -#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ -#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ -#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ -#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ -#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ - -enum const_index { - index_PW_F130_F054, - index_PW_F054_MF130, - index_PW_MF078_F117, - index_PW_F117_F078, - index_PW_MF060_MF089, - index_PW_MF089_F060, - index_PW_MF050_MF256, - index_PW_MF256_F050, - index_PD_DESCALE_P1, - index_PD_DESCALE_P2, - index_PW_DESCALE_P2X -}; - -static uint64_t const_value[] = { - _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), - FIX_0_541, (FIX_0_541 + FIX_0_765)), - _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, - (FIX_0_541 - FIX_1_847), FIX_0_541), - _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), - FIX_1_175, (FIX_1_175 - FIX_1_961)), - _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, - (FIX_1_175 - FIX_0_390), FIX_1_175), - _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), - -FIX_0_899, (FIX_0_298 - FIX_0_899)), - _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, - (FIX_1_501 - FIX_0_899), -FIX_0_899), - _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), - -FIX_2_562, (FIX_2_053 - FIX_2_562)), - _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, - (FIX_3_072 - FIX_2_562), -FIX_2_562), - _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), - _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), - _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)), - (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1))) -}; - -#define PW_F130_F054 get_const_value(index_PW_F130_F054) -#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) -#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) -#define PW_F117_F078 get_const_value(index_PW_F117_F078) -#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) -#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) -#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) -#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) -#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) -#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) -#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X) - - -#define DO_FDCT_COMMON(PASS) { \ - __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \ - __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \ - __m64 out1l, out1h, out2l, out2h, out3l, out3h; \ - __m64 out5l, out5h, out6l, out6h, out7l, out7h; \ - __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ - \ - /* (Original) \ - * z1 = (tmp12 + tmp13) * 0.541196100; \ - * out2 = z1 + tmp13 * 0.765366865; \ - * out6 = z1 + tmp12 * -1.847759065; \ - * \ - * (This implementation) \ - * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ - * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ - */ \ - \ - tmp1312l 
= _mm_unpacklo_pi16(tmp13, tmp12); \ - tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \ - \ - out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \ - out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \ - out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \ - out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \ - \ - out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ - out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ - out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ - out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ - \ - out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ - out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ - out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ - out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ - \ - out2 = _mm_packs_pi32(out2l, out2h); \ - out6 = _mm_packs_pi32(out6l, out6h); \ - \ - /* Odd part */ \ - \ - z3 = _mm_add_pi16(tmp4, tmp6); \ - z4 = _mm_add_pi16(tmp5, tmp7); \ - \ - /* (Original) \ - * z5 = (z3 + z4) * 1.175875602; \ - * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ - * z3 += z5; z4 += z5; \ - * \ - * (This implementation) \ - * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ - * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ - */ \ - \ - z34l = _mm_unpacklo_pi16(z3, z4); \ - z34h = _mm_unpackhi_pi16(z3, z4); \ - z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ - z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ - z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ - z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ - \ - /* (Original) \ - * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ - * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ - * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ - * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ - * out7 = tmp4 + z1 + z3; out5 = tmp5 + z2 + z4; \ - * out3 = tmp6 + z2 + z3; out1 = tmp7 + z1 + z4; \ - * \ - * (This implementation) \ - * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ - * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ - * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ - * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ - * out7 = tmp4 + z3; out5 = tmp5 + z4; \ - * out3 = tmp6 + z3; out1 = tmp7 + z4; \ - */ \ - \ - tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \ - tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \ - \ - tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \ - tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \ - tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \ - tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \ - \ - out7l = _mm_add_pi32(tmp4l, z3l); \ - out7h = _mm_add_pi32(tmp4h, z3h); \ - out1l = _mm_add_pi32(tmp7l, z4l); \ - out1h = _mm_add_pi32(tmp7h, z4h); \ - \ - out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ - out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ - out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ - out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ - \ - out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ - out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ - out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ - out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ - \ - out7 = _mm_packs_pi32(out7l, out7h); \ - out1 = _mm_packs_pi32(out1l, out1h); \ - \ - tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \ - tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \ - \ - tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \ - tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \ - tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \ - tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \ - \ - out5l = _mm_add_pi32(tmp5l, z4l); \ - out5h = 
_mm_add_pi32(tmp5h, z4h); \ - out3l = _mm_add_pi32(tmp6l, z3l); \ - out3h = _mm_add_pi32(tmp6h, z3h); \ - \ - out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ - out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ - out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ - out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ - \ - out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ - out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ - out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ - out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ - \ - out5 = _mm_packs_pi32(out5l, out5h); \ - out3 = _mm_packs_pi32(out3l, out3h); \ -} - -#define DO_FDCT_PASS1() { \ - __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ - __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ - __m64 col0, col1, col2, col3, col4, col5, col6, col7; \ - __m64 tmp10, tmp11; \ - \ - row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ - row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \ - row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ - row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \ - row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ - row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \ - row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ - row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \ - \ - /* Transpose coefficients */ \ - \ - row23a = _mm_unpacklo_pi16(row2l, row3l); /* row23a=(20 30 21 31) */ \ - row23b = _mm_unpackhi_pi16(row2l, row3l); /* row23b=(22 32 23 33) */ \ - row23c = _mm_unpacklo_pi16(row2h, row3h); /* row23c=(24 34 25 35) */ \ - row23d = _mm_unpackhi_pi16(row2h, row3h); /* row23d=(26 36 27 37) */ \ - \ - row01a = _mm_unpacklo_pi16(row0l, row1l); /* row01a=(00 10 01 11) */ \ - row01b = _mm_unpackhi_pi16(row0l, row1l); /* row01b=(02 12 03 13) */ \ - row01c = _mm_unpacklo_pi16(row0h, row1h); /* row01c=(04 14 05 15) */ \ - row01d = _mm_unpackhi_pi16(row0h, row1h); /* row01d=(06 16 07 17) */ \ - \ - col0 = _mm_unpacklo_pi32(row01a, row23a); /* col0=(00 10 20 30) */ \ - col1 = _mm_unpackhi_pi32(row01a, row23a); /* col1=(01 11 21 31) */ \ - col6 = _mm_unpacklo_pi32(row01d, row23d); /* col6=(06 16 26 36) */ \ - col7 = _mm_unpackhi_pi32(row01d, row23d); /* col7=(07 17 27 37) */ \ - \ - tmp6 = _mm_sub_pi16(col1, col6); /* tmp6=col1-col6 */ \ - tmp7 = _mm_sub_pi16(col0, col7); /* tmp7=col0-col7 */ \ - tmp1 = _mm_add_pi16(col1, col6); /* tmp1=col1+col6 */ \ - tmp0 = _mm_add_pi16(col0, col7); /* tmp0=col0+col7 */ \ - \ - col2 = _mm_unpacklo_pi32(row01b, row23b); /* col2=(02 12 22 32) */ \ - col3 = _mm_unpackhi_pi32(row01b, row23b); /* col3=(03 13 23 33) */ \ - col4 = _mm_unpacklo_pi32(row01c, row23c); /* col4=(04 14 24 34) */ \ - col5 = _mm_unpackhi_pi32(row01c, row23c); /* col5=(05 15 25 35) */ \ - \ - tmp3 = _mm_add_pi16(col3, col4); /* tmp3=col3+col4 */ \ - tmp2 = _mm_add_pi16(col2, col5); /* tmp2=col2+col5 */ \ - tmp4 = _mm_sub_pi16(col3, col4); /* tmp4=col3-col4 */ \ - tmp5 = _mm_sub_pi16(col2, col5); /* tmp5=col2-col5 */ \ - \ - /* Even part */ \ - \ - tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ - tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ - tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ - tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ - \ - out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ - out4 = 
_mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ - out0 = _mm_slli_pi16(out0, PASS1_BITS); \ - out4 = _mm_slli_pi16(out4, PASS1_BITS); \ - \ - DO_FDCT_COMMON(1) \ - \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \ -} - -#define DO_FDCT_PASS2() { \ - __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \ - __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \ - __m64 row0, row1, row2, row3, row4, row5, row6, row7; \ - __m64 tmp10, tmp11; \ - \ - col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ - col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ - col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ - col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ - col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]); /* (40 50 60 70) */ \ - col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]); /* (41 51 61 71) */ \ - col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]); /* (42 52 62 72) */ \ - col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]); /* (43 53 63 73) */ \ - \ - /* Transpose coefficients */ \ - \ - col23a = _mm_unpacklo_pi16(col2l, col3l); /* col23a=(02 03 12 13) */ \ - col23b = _mm_unpackhi_pi16(col2l, col3l); /* col23b=(22 23 32 33) */ \ - col23c = _mm_unpacklo_pi16(col2h, col3h); /* col23c=(42 43 52 53) */ \ - col23d = _mm_unpackhi_pi16(col2h, col3h); /* col23d=(62 63 72 73) */ \ - \ - col01a = _mm_unpacklo_pi16(col0l, col1l); /* col01a=(00 01 10 11) */ \ - col01b = _mm_unpackhi_pi16(col0l, col1l); /* col01b=(20 21 30 31) */ \ - col01c = _mm_unpacklo_pi16(col0h, col1h); /* col01c=(40 41 50 51) */ \ - col01d = _mm_unpackhi_pi16(col0h, col1h); /* col01d=(60 61 70 71) */ \ - \ - row0 = _mm_unpacklo_pi32(col01a, col23a); /* row0=(00 01 02 03) */ \ - row1 = _mm_unpackhi_pi32(col01a, col23a); /* row1=(10 11 12 13) */ \ - row6 = _mm_unpacklo_pi32(col01d, col23d); /* row6=(60 61 62 63) */ \ - row7 = _mm_unpackhi_pi32(col01d, col23d); /* row7=(70 71 72 73) */ \ - \ - tmp6 = _mm_sub_pi16(row1, row6); /* tmp6=row1-row6 */ \ - tmp7 = _mm_sub_pi16(row0, row7); /* tmp7=row0-row7 */ \ - tmp1 = _mm_add_pi16(row1, row6); /* tmp1=row1+row6 */ \ - tmp0 = _mm_add_pi16(row0, row7); /* tmp0=row0+row7 */ \ - \ - row2 = _mm_unpacklo_pi32(col01b, col23b); /* row2=(20 21 22 23) */ \ - row3 = _mm_unpackhi_pi32(col01b, col23b); /* row3=(30 31 32 33) */ \ - row4 = _mm_unpacklo_pi32(col01c, col23c); /* row4=(40 41 42 43) */ \ - row5 = _mm_unpackhi_pi32(col01c, col23c); /* row5=(50 51 52 53) */ \ - \ - tmp3 = _mm_add_pi16(row3, row4); /* tmp3=row3+row4 */ \ - tmp2 = _mm_add_pi16(row2, row5); /* tmp2=row2+row5 */ \ - tmp4 = _mm_sub_pi16(row3, row4); /* tmp4=row3-row4 */ \ - tmp5 = _mm_sub_pi16(row2, row5); /* tmp5=row2-row5 */ \ - \ - /* Even part */ \ - \ - tmp10 = _mm_add_pi16(tmp0, tmp3); /* tmp10=tmp0+tmp3 */ \ - tmp13 = _mm_sub_pi16(tmp0, tmp3); /* tmp13=tmp0-tmp3 */ \ - tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ - tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ - \ - out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ 
- out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ - \ - out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \ - out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \ - out0 = _mm_srai_pi16(out0, PASS1_BITS); \ - out4 = _mm_srai_pi16(out4, PASS1_BITS); \ - \ - DO_FDCT_COMMON(2) \ - \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \ - _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \ -} - -void jsimd_fdct_islow_mmi(DCTELEM *data) -{ - __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - __m64 out0, out1, out2, out3, out4, out5, out6, out7; - __m64 tmp12, tmp13; - DCTELEM *dataptr = data; - - /* Pass 1: process rows. */ - - DO_FDCT_PASS1() - dataptr += DCTSIZE * 4; - DO_FDCT_PASS1() - - /* Pass 2: process columns. */ - - dataptr = data; - DO_FDCT_PASS2() - dataptr += 4; - DO_FDCT_PASS2() -} diff --git a/simd/loongson/jidctint-mmi.c b/simd/loongson/jidctint-mmi.c deleted file mode 100644 index 419c638..0000000 --- a/simd/loongson/jidctint-mmi.c +++ /dev/null @@ -1,571 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, 2018, D. R. Commander. All Rights Reserved. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
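The "(Original)" / "(This implementation)" comments in the deleted jfdctint-mmi.c above record a standard rearrangement for _mm_madd_pi16: the shared rotation term z1 = (tmp12 + tmp13) * 0.541196100 is folded into per-output constants, so each output becomes a single two-term multiply-accumulate over an interleaved pair of 16-bit words. The even-part rotation in scalar form, a sketch only (constants as defined above with FIX(x) = x * 2^13; descaling and rounding omitted; function name illustrative):

static void fdct_even_rotate(int tmp12, int tmp13, long *out2, long *out6)
{
  const int F_0_541 = 4433;   /* FIX(0.541196100) */
  const int F_0_765 = 6270;   /* FIX(0.765366865) */
  const int F_1_847 = 15137;  /* FIX(1.847759065) */

  /* out2 = z1 + tmp13 * 0.765366865 and out6 = z1 - tmp12 * 1.847759065,
   * with z1 = (tmp12 + tmp13) * 0.541196100, rewritten so each output is
   * one two-term dot product (what _mm_madd_pi16 computes per lane): */
  *out2 = (long)tmp13 * (F_0_541 + F_0_765) + (long)tmp12 * F_0_541;
  *out6 = (long)tmp13 * F_0_541 + (long)tmp12 * (F_0_541 - F_1_847);
}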
- */ - -/* SLOW INTEGER INVERSE DCT */ - -#include "jsimd_mmi.h" - - -#define CONST_BITS 13 -#define PASS1_BITS 2 -#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) -#define CENTERJSAMPLE 128 - -#define FIX_0_298 ((short)2446) /* FIX(0.298631336) */ -#define FIX_0_390 ((short)3196) /* FIX(0.390180644) */ -#define FIX_0_899 ((short)7373) /* FIX(0.899976223) */ -#define FIX_0_541 ((short)4433) /* FIX(0.541196100) */ -#define FIX_0_765 ((short)6270) /* FIX(0.765366865) */ -#define FIX_1_175 ((short)9633) /* FIX(1.175875602) */ -#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */ -#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */ -#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */ -#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */ -#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */ -#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */ - -enum const_index { - index_PW_F130_F054, - index_PW_F054_MF130, - index_PW_MF078_F117, - index_PW_F117_F078, - index_PW_MF060_MF089, - index_PW_MF089_F060, - index_PW_MF050_MF256, - index_PW_MF256_F050, - index_PD_DESCALE_P1, - index_PD_DESCALE_P2, - index_PB_CENTERJSAMP -}; - -static uint64_t const_value[] = { - _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765), - FIX_0_541, (FIX_0_541 + FIX_0_765)), - _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541, - (FIX_0_541 - FIX_1_847), FIX_0_541), - _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961), - FIX_1_175, (FIX_1_175 - FIX_1_961)), - _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175, - (FIX_1_175 - FIX_0_390), FIX_1_175), - _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899), - -FIX_0_899, (FIX_0_298 - FIX_0_899)), - _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899, - (FIX_1_501 - FIX_0_899), -FIX_0_899), - _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562), - -FIX_2_562, (FIX_2_053 - FIX_2_562)), - _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562, - (FIX_3_072 - FIX_2_562), -FIX_2_562), - _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))), - _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))), - _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, - CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE) -}; - -#define PW_F130_F054 get_const_value(index_PW_F130_F054) -#define PW_F054_MF130 get_const_value(index_PW_F054_MF130) -#define PW_MF078_F117 get_const_value(index_PW_MF078_F117) -#define PW_F117_F078 get_const_value(index_PW_F117_F078) -#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089) -#define PW_MF089_F060 get_const_value(index_PW_MF089_F060) -#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256) -#define PW_MF256_F050 get_const_value(index_PW_MF256_F050) -#define PD_DESCALE_P1 get_const_value(index_PD_DESCALE_P1) -#define PD_DESCALE_P2 get_const_value(index_PD_DESCALE_P2) -#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) - - -#define test_m32_zero(mm32) (!(*(uint32_t *)&mm32)) -#define test_m64_zero(mm64) (!(*(uint64_t *)&mm64)) - - -#define DO_IDCT_COMMON(PASS) { \ - __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \ - __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ - __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \ - __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \ - __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \ - \ - z3 = _mm_add_pi16(tmp0, tmp2); \ - z4 = _mm_add_pi16(tmp1, tmp3); \ - \ - /* (Original) \ - * z5 = (z3 + z4) * 1.175875602; \ - * z3 = z3 * -1.961570560; z4 = z4 * 
-0.390180644; \ - * z3 += z5; z4 += z5; \ - * \ - * (This implementation) \ - * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ - * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ - */ \ - \ - z34l = _mm_unpacklo_pi16(z3, z4); \ - z34h = _mm_unpackhi_pi16(z3, z4); \ - z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \ - z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \ - z4l = _mm_madd_pi16(z34l, PW_F117_F078); \ - z4h = _mm_madd_pi16(z34h, PW_F117_F078); \ - \ - /* (Original) \ - * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ - * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ - * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ - * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ - * tmp0 += z1 + z3; tmp1 += z2 + z4; \ - * tmp2 += z2 + z3; tmp3 += z1 + z4; \ - * \ - * (This implementation) \ - * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ - * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ - * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ - * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ - * tmp0 += z3; tmp1 += z4; \ - * tmp2 += z3; tmp3 += z4; \ - */ \ - \ - tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \ - tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \ - \ - tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \ - tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \ - tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \ - tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \ - \ - tmp0l = _mm_add_pi32(tmp0l, z3l); \ - tmp0h = _mm_add_pi32(tmp0h, z3h); \ - tmp3l = _mm_add_pi32(tmp3l, z4l); \ - tmp3h = _mm_add_pi32(tmp3h, z4h); \ - \ - tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \ - tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \ - \ - tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \ - tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \ - tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \ - tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \ - \ - tmp1l = _mm_add_pi32(tmp1l, z4l); \ - tmp1h = _mm_add_pi32(tmp1h, z4h); \ - tmp2l = _mm_add_pi32(tmp2l, z3l); \ - tmp2h = _mm_add_pi32(tmp2h, z3h); \ - \ - /* Final output stage */ \ - \ - out0l = _mm_add_pi32(tmp10l, tmp3l); \ - out0h = _mm_add_pi32(tmp10h, tmp3h); \ - out7l = _mm_sub_pi32(tmp10l, tmp3l); \ - out7h = _mm_sub_pi32(tmp10h, tmp3h); \ - \ - out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \ - out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \ - out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \ - out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \ - \ - out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \ - out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \ - out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \ - out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \ - \ - out0 = _mm_packs_pi32(out0l, out0h); \ - out7 = _mm_packs_pi32(out7l, out7h); \ - \ - out1l = _mm_add_pi32(tmp11l, tmp2l); \ - out1h = _mm_add_pi32(tmp11h, tmp2h); \ - out6l = _mm_sub_pi32(tmp11l, tmp2l); \ - out6h = _mm_sub_pi32(tmp11h, tmp2h); \ - \ - out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \ - out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \ - out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \ - out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \ - \ - out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \ - out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \ - out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \ - out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \ - \ - out1 = _mm_packs_pi32(out1l, out1h); \ - out6 = _mm_packs_pi32(out6l, out6h); \ - \ - out2l = _mm_add_pi32(tmp12l, tmp1l); \ - out2h = 
_mm_add_pi32(tmp12h, tmp1h); \ - out5l = _mm_sub_pi32(tmp12l, tmp1l); \ - out5h = _mm_sub_pi32(tmp12h, tmp1h); \ - \ - out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \ - out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \ - out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \ - out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \ - \ - out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \ - out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \ - out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \ - out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \ - \ - out2 = _mm_packs_pi32(out2l, out2h); \ - out5 = _mm_packs_pi32(out5l, out5h); \ - \ - out3l = _mm_add_pi32(tmp13l, tmp0l); \ - out3h = _mm_add_pi32(tmp13h, tmp0h); \ - \ - out4l = _mm_sub_pi32(tmp13l, tmp0l); \ - out4h = _mm_sub_pi32(tmp13h, tmp0h); \ - \ - out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \ - out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \ - out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \ - out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \ - \ - out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \ - out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \ - out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \ - out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \ - \ - out3 = _mm_packs_pi32(out3l, out3h); \ - out4 = _mm_packs_pi32(out4l, out4h); \ -} - -#define DO_IDCT_PASS1(iter) { \ - __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \ - __m64 quant0l, quant1l, quant2l, quant3l; \ - __m64 quant4l, quant5l, quant6l, quant7l; \ - __m64 z23, z2, z3, z23l, z23h; \ - __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \ - __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \ - __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ - __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ - __m32 col0a, col1a, mm0; \ - \ - col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \ - col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \ - mm0 = _mm_or_si32(col0a, col1a); \ - \ - if (test_m32_zero(mm0)) { \ - __m64 mm1, mm2; \ - \ - col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \ - col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \ - col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \ - col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \ - col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \ - col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \ - col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \ - col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \ - \ - mm1 = _mm_or_si64(col1l, col3l); \ - mm2 = _mm_or_si64(col2l, col4l); \ - mm1 = _mm_or_si64(mm1, col5l); \ - mm2 = _mm_or_si64(mm2, col6l); \ - mm1 = _mm_or_si64(mm1, col7l); \ - mm1 = _mm_or_si64(mm1, mm2); \ - \ - if (test_m64_zero(mm1)) { \ - __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \ - \ - /* AC terms all zero */ \ - \ - quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ - \ - dcval = _mm_mullo_pi16(col0l, quant0l); \ - dcval = _mm_slli_pi16(dcval, PASS1_BITS); /* dcval=(00 10 20 30) */ \ - \ - dcvall = _mm_unpacklo_pi16(dcval, dcval); /* dcvall=(00 00 10 10) */ \ - dcvalh = _mm_unpackhi_pi16(dcval, dcval); /* dcvalh=(20 20 30 30) */ \ - \ - row0 = _mm_unpacklo_pi32(dcvall, dcvall); /* row0=(00 00 00 00) */ \ - row1 = _mm_unpackhi_pi32(dcvall, dcvall); /* row1=(10 10 10 10) */ \ - row2 = _mm_unpacklo_pi32(dcvalh, dcvalh); /* row2=(20 20 20 20) */ \ - row3 = _mm_unpackhi_pi32(dcvalh, dcvalh); /* row3=(30 30 30 30) */ \ - \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \ - 
_mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \ - \ - goto nextcolumn##iter; \ - } \ - } \ - \ - /* Even part \ - * \ - * (Original) \ - * z1 = (z2 + z3) * 0.541196100; \ - * tmp2 = z1 + z3 * -1.847759065; \ - * tmp3 = z1 + z2 * 0.765366865; \ - * \ - * (This implementation) \ - * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ - * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ - */ \ - \ - col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); /* (00 10 20 30) */ \ - col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); /* (02 12 22 32) */ \ - col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); /* (04 14 24 34) */ \ - col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); /* (06 16 26 36) */ \ - \ - quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \ - quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \ - quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \ - quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \ - \ - z2 = _mm_mullo_pi16(col2l, quant2l); \ - z3 = _mm_mullo_pi16(col6l, quant6l); \ - \ - z23l = _mm_unpacklo_pi16(z2, z3); \ - z23h = _mm_unpackhi_pi16(z2, z3); \ - tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ - tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ - tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ - tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ - \ - z2 = _mm_mullo_pi16(col0l, quant0l); \ - z3 = _mm_mullo_pi16(col4l, quant4l); \ - \ - z23 = _mm_add_pi16(z2, z3); \ - tmp0l = _mm_loadlo_pi16_f(z23); \ - tmp0h = _mm_loadhi_pi16_f(z23); \ - tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ - tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ - \ - tmp10l = _mm_add_pi32(tmp0l, tmp3l); \ - tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ - tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ - tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ - \ - z23 = _mm_sub_pi16(z2, z3); \ - tmp1l = _mm_loadlo_pi16_f(z23); \ - tmp1h = _mm_loadhi_pi16_f(z23); \ - tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ - tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ - \ - tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ - tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ - tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ - tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ - \ - /* Odd part */ \ - \ - col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); /* (01 11 21 31) */ \ - col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); /* (03 13 23 33) */ \ - col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); /* (05 15 25 35) */ \ - col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); /* (07 17 27 37) */ \ - \ - quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \ - quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \ - quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \ - quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \ - \ - tmp0 = _mm_mullo_pi16(col7l, quant7l); \ - tmp1 = _mm_mullo_pi16(col5l, quant5l); \ - tmp2 = _mm_mullo_pi16(col3l, quant3l); \ - tmp3 = _mm_mullo_pi16(col1l, quant1l); \ - \ - DO_IDCT_COMMON(1) \ - \ - /* out0=(00 10 20 30), out1=(01 11 21 31) */ \ - /* out2=(02 12 22 32), out3=(03 13 23 33) */ \ - /* out4=(04 14 24 34), out5=(05 15 25 35) */ \ - /* out6=(06 16 26 36), out7=(07 17 27 37) */ \ - \ - /* Transpose 
coefficients */ \ - \ - row01a = _mm_unpacklo_pi16(out0, out1); /* row01a=(00 01 10 11) */ \ - row23a = _mm_unpackhi_pi16(out0, out1); /* row23a=(20 21 30 31) */ \ - row01d = _mm_unpacklo_pi16(out6, out7); /* row01d=(06 07 16 17) */ \ - row23d = _mm_unpackhi_pi16(out6, out7); /* row23d=(26 27 36 37) */ \ - \ - row01b = _mm_unpacklo_pi16(out2, out3); /* row01b=(02 03 12 13) */ \ - row23b = _mm_unpackhi_pi16(out2, out3); /* row23b=(22 23 32 33) */ \ - row01c = _mm_unpacklo_pi16(out4, out5); /* row01c=(04 05 14 15) */ \ - row23c = _mm_unpackhi_pi16(out4, out5); /* row23c=(24 25 34 35) */ \ - \ - row0l = _mm_unpacklo_pi32(row01a, row01b); /* row0l=(00 01 02 03) */ \ - row1l = _mm_unpackhi_pi32(row01a, row01b); /* row1l=(10 11 12 13) */ \ - row2l = _mm_unpacklo_pi32(row23a, row23b); /* row2l=(20 21 22 23) */ \ - row3l = _mm_unpackhi_pi32(row23a, row23b); /* row3l=(30 31 32 33) */ \ - \ - row0h = _mm_unpacklo_pi32(row01c, row01d); /* row0h=(04 05 06 07) */ \ - row1h = _mm_unpackhi_pi32(row01c, row01d); /* row1h=(14 15 16 17) */ \ - row2h = _mm_unpacklo_pi32(row23c, row23d); /* row2h=(24 25 26 27) */ \ - row3h = _mm_unpackhi_pi32(row23c, row23d); /* row3h=(34 35 36 37) */ \ - \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \ - _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \ -} - -#define DO_IDCT_PASS2(ctr) { \ - __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \ - __m64 z23, z23l, z23h; \ - __m64 col0123a, col0123b, col0123c, col0123d; \ - __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \ - __m64 col0, col1, col2, col3; \ - __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \ - __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \ - \ - row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]); /* (00 01 02 03) */ \ - row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]); /* (10 11 12 13) */ \ - row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]); /* (20 21 22 23) */ \ - row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]); /* (30 31 32 33) */ \ - row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]); /* (40 41 42 43) */ \ - row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]); /* (50 51 52 53) */ \ - row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]); /* (60 61 62 63) */ \ - row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]); /* (70 71 72 73) */ \ - \ - /* Even part \ - * \ - * (Original) \ - * z1 = (z2 + z3) * 0.541196100; \ - * tmp2 = z1 + z3 * -1.847759065; \ - * tmp3 = z1 + z2 * 0.765366865; \ - * \ - * (This implementation) \ - * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ - * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ - */ \ - \ - z23l = _mm_unpacklo_pi16(row2l, row6l); \ - z23h = _mm_unpackhi_pi16(row2l, row6l); \ - \ - tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \ - tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \ - tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \ - tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \ - \ - z23 = _mm_add_pi16(row0l, row4l); \ - tmp0l = _mm_loadlo_pi16_f(z23); \ - tmp0h = _mm_loadhi_pi16_f(z23); \ - tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \ - tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \ - \ - tmp10l = 
_mm_add_pi32(tmp0l, tmp3l); \ - tmp10h = _mm_add_pi32(tmp0h, tmp3h); \ - tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \ - tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \ - \ - z23 = _mm_sub_pi16(row0l, row4l); \ - tmp1l = _mm_loadlo_pi16_f(z23); \ - tmp1h = _mm_loadhi_pi16_f(z23); \ - tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \ - tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \ - \ - tmp11l = _mm_add_pi32(tmp1l, tmp2l); \ - tmp11h = _mm_add_pi32(tmp1h, tmp2h); \ - tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \ - tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \ - \ - /* Odd part */ \ - \ - tmp0 = row7l; \ - tmp1 = row5l; \ - tmp2 = row3l; \ - tmp3 = row1l; \ - \ - DO_IDCT_COMMON(2) \ - \ - /* out0=(00 01 02 03), out1=(10 11 12 13) */ \ - /* out2=(20 21 22 23), out3=(30 31 32 33) */ \ - /* out4=(40 41 42 43), out5=(50 51 52 53) */ \ - /* out6=(60 61 62 63), out7=(70 71 72 73) */ \ - \ - row06 = _mm_packs_pi16(out0, out6); /* row06=(00 01 02 03 60 61 62 63) */ \ - row17 = _mm_packs_pi16(out1, out7); /* row17=(10 11 12 13 70 71 72 73) */ \ - row24 = _mm_packs_pi16(out2, out4); /* row24=(20 21 22 23 40 41 42 43) */ \ - row35 = _mm_packs_pi16(out3, out5); /* row35=(30 31 32 33 50 51 52 53) */ \ - \ - row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \ - row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \ - row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \ - row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \ - \ - /* Transpose coefficients */ \ - \ - col0123a = _mm_unpacklo_pi8(row06, row17); /* col0123a=(00 10 01 11 02 12 03 13) */ \ - col0123d = _mm_unpackhi_pi8(row06, row17); /* col0123d=(60 70 61 71 62 72 63 73) */ \ - col0123b = _mm_unpacklo_pi8(row24, row35); /* col0123b=(20 30 21 31 22 32 23 33) */ \ - col0123c = _mm_unpackhi_pi8(row24, row35); /* col0123c=(40 50 41 51 42 52 43 53) */ \ - \ - col01l = _mm_unpacklo_pi16(col0123a, col0123b); /* col01l=(00 10 20 30 01 11 21 31) */ \ - col23l = _mm_unpackhi_pi16(col0123a, col0123b); /* col23l=(02 12 22 32 03 13 23 33) */ \ - col01h = _mm_unpacklo_pi16(col0123c, col0123d); /* col01h=(40 50 60 70 41 51 61 71) */ \ - col23h = _mm_unpackhi_pi16(col0123c, col0123d); /* col23h=(42 52 62 72 43 53 63 73) */ \ - \ - col0 = _mm_unpacklo_pi32(col01l, col01h); /* col0=(00 10 20 30 40 50 60 70) */ \ - col1 = _mm_unpackhi_pi32(col01l, col01h); /* col1=(01 11 21 31 41 51 61 71) */ \ - col2 = _mm_unpacklo_pi32(col23l, col23h); /* col2=(02 12 22 32 42 52 62 72) */ \ - col3 = _mm_unpackhi_pi32(col23l, col23h); /* col3=(03 13 23 33 43 53 63 73) */ \ - \ - _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \ - _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \ - _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \ - _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \ -} - -void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) -{ - __m64 tmp0, tmp1, tmp2, tmp3; - __m64 out0, out1, out2, out3, out4, out5, out6, out7; - JCOEFPTR inptr; - ISLOW_MULT_TYPE *quantptr; - JCOEF *wsptr; - JCOEF workspace[DCTSIZE2]; /* buffers data between passes */ - - /* Pass 1: process columns. */ - - inptr = coef_block; - quantptr = (ISLOW_MULT_TYPE *)dct_table; - wsptr = workspace; - - DO_IDCT_PASS1(1) -nextcolumn1: - inptr += 4; - quantptr += 4; - wsptr += DCTSIZE * 4; - DO_IDCT_PASS1(2) -nextcolumn2: - - /* Pass 2: process rows. 
*/ - - wsptr = workspace; - - DO_IDCT_PASS2(0) - wsptr += 4; - DO_IDCT_PASS2(4) -} diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c deleted file mode 100644 index f9a3f81..0000000 --- a/simd/loongson/jquanti-mmi.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * Copyright (C) 2018, D. R. Commander. All Rights Reserved. - * - * Based on the x86 SIMD extension for IJG JPEG library - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ - -#include "jsimd_mmi.h" - - -#define DO_QUANT() { \ - mm2 = _mm_load_si64((__m64 *)&workspace[0]); \ - mm3 = _mm_load_si64((__m64 *)&workspace[4]); \ - \ - mm0 = mm2; \ - mm1 = mm3; \ - \ - mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1)); /* -1 if value < 0, */ \ - /* 0 otherwise */ \ - mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \ - \ - mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ - mm1 = _mm_xor_si64(mm1, mm3); \ - mm0 = _mm_sub_pi16(mm0, mm2); \ - mm1 = _mm_sub_pi16(mm1, mm3); \ - \ - corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \ - corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \ - \ - mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \ - mm1 = _mm_add_pi16(mm1, corr1); \ - \ - mm4 = mm0; \ - mm5 = mm1; \ - \ - recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \ - recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \ - \ - mm0 = _mm_mulhi_pi16(mm0, recip0); \ - mm1 = _mm_mulhi_pi16(mm1, recip1); \ - \ - mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \ - mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \ - /* initial value (input value is never */ \ - /* negative as we inverted it at the */ \ - /* start of this routine) */ \ - \ - scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \ - scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \ - \ - mm6 = scale0; \ - mm7 = scale1; \ - mm4 = mm0; \ - mm5 = mm1; \ - \ - mm0 = _mm_mulhi_pi16(mm0, mm6); \ - mm1 = _mm_mulhi_pi16(mm1, mm7); \ - \ - mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1)); /* determine if scale... 
*/ \ - /* is negative */ \ - mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \ - \ - mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \ - mm7 = _mm_and_si64(mm7, mm5); \ - mm0 = _mm_add_pi16(mm0, mm6); \ - mm1 = _mm_add_pi16(mm1, mm7); \ - \ - mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1)); /* then check if... */ \ - mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1)); /* negative input */ \ - \ - mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \ - mm5 = _mm_and_si64(mm5, scale1); \ - mm0 = _mm_add_pi16(mm0, mm4); \ - mm1 = _mm_add_pi16(mm1, mm5); \ - \ - mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ - mm1 = _mm_xor_si64(mm1, mm3); \ - mm0 = _mm_sub_pi16(mm0, mm2); \ - mm1 = _mm_sub_pi16(mm1, mm3); \ - \ - _mm_store_si64((__m64 *)&output_ptr[0], mm0); \ - _mm_store_si64((__m64 *)&output_ptr[4], mm1); \ - \ - workspace += DCTSIZE; \ - divisors += DCTSIZE; \ - output_ptr += DCTSIZE; \ -} - - -void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors, - DCTELEM *workspace) -{ - JCOEFPTR output_ptr = coef_block; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 corr0, corr1, recip0, recip1, scale0, scale1; - - DO_QUANT() - DO_QUANT() - DO_QUANT() - DO_QUANT() - DO_QUANT() - DO_QUANT() - DO_QUANT() - DO_QUANT() -} diff --git a/simd/loongson/jsimd.c b/simd/loongson/jsimd.c deleted file mode 100644 index e8b1832..0000000 --- a/simd/loongson/jsimd.c +++ /dev/null @@ -1,610 +0,0 @@ -/* - * jsimd_loongson.c - * - * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. - * Copyright (C) 2015, 2018, Matthieu Darbois. - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * - * Based on the x86 SIMD extension for IJG JPEG library, - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * For conditions of distribution and use, see copyright notice in jsimdext.inc - * - * This file contains the interface between the "normal" portions - * of the library and the SIMD implementations when running on a - * Loongson architecture. - */ - -#define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" - -static unsigned int simd_support = ~0; - -/* - * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. 
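The DO_QUANT macro above divides each DCT coefficient by its quantization step without an actual division: the sign is folded out with the xor/subtract trick, a correction word from divisors[DCTSIZE2 * 1] rounds the magnitude, and the reciprocal (divisors[DCTSIZE2 * 0]) and scale (divisors[DCTSIZE2 * 2]) words reduce the division to _mm_mulhi_pi16 high-half multiplies. Arithmetically it approximates the following scalar routine -- a sketch of the intended result, not of the reciprocal encoding itself; quantval stands for the quantization-table entry:

static short quantize_one(short coef, unsigned short quantval)
{
  int sign = coef >> 15;                                    /* -1 if coef < 0, else 0 */
  unsigned int mag = (unsigned int)((coef ^ sign) - sign);  /* |coef| */
  unsigned int q = (mag + quantval / 2) / quantval;         /* rounded divide */

  return (short)(((int)q ^ sign) - sign);                   /* put the sign back */
}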
- */ -LOCAL(void) -init_simd(void) -{ -#ifndef NO_GETENV - char *env = NULL; -#endif - - if (simd_support != ~0U) - return; - - simd_support |= JSIMD_MMI; - -#ifndef NO_GETENV - /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_support = 0; -#endif -} - -GLOBAL(int) -jsimd_can_rgb_ycc(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_rgb_gray(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb565(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_c_can_null_convert(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - - switch (cinfo->in_color_space) { - case JCS_EXT_RGB: - mmifct = jsimd_extrgb_ycc_convert_mmi; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - mmifct = jsimd_extrgbx_ycc_convert_mmi; - break; - case JCS_EXT_BGR: - mmifct = jsimd_extbgr_ycc_convert_mmi; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - mmifct = jsimd_extbgrx_ycc_convert_mmi; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - mmifct = jsimd_extxbgr_ycc_convert_mmi; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - mmifct = jsimd_extxrgb_ycc_convert_mmi; - break; - default: - mmifct = jsimd_rgb_ycc_convert_mmi; - break; - } - - mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); -} - -GLOBAL(void) -jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ -} - -GLOBAL(void) -jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ - void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - mmifct = jsimd_ycc_extrgb_convert_mmi; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - mmifct = jsimd_ycc_extrgbx_convert_mmi; - break; - case JCS_EXT_BGR: - mmifct = jsimd_ycc_extbgr_convert_mmi; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - mmifct = jsimd_ycc_extbgrx_convert_mmi; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - mmifct = jsimd_ycc_extxbgr_convert_mmi; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - mmifct = jsimd_ycc_extxrgb_convert_mmi; - break; - default: - mmifct = jsimd_ycc_rgb_convert_mmi; - break; - } - - mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); -} - -GLOBAL(void) -jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ -} - -GLOBAL(void) -jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_downsample(void) -{ - init_simd(); - - /* The code 
is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v2_smooth_downsample(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_downsample(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); -} - -GLOBAL(void) -jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, - jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ -} - -GLOBAL(void) -jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_upsample(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_upsample(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_int_upsample(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ -} - -GLOBAL(void) -jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ -} - -GLOBAL(void) -jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_fancy_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_fancy_upsample(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, - output_data_ptr); -} - -GLOBAL(void) -jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_merged_upsample(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_merged_upsample(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ -} - -GLOBAL(void) -jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ -} - -GLOBAL(int) -jsimd_can_convsamp(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_convsamp_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM *workspace) -{ -} - -GLOBAL(void) -jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT *workspace) -{ -} - -GLOBAL(int) -jsimd_can_fdct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_ifast(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_fdct_islow(DCTELEM *data) -{ - 
jsimd_fdct_islow_mmi(data); -} - -GLOBAL(void) -jsimd_fdct_ifast(DCTELEM *data) -{ -} - -GLOBAL(void) -jsimd_fdct_float(FAST_FLOAT *data) -{ -} - -GLOBAL(int) -jsimd_can_quantize(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_quantize_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) -{ - jsimd_quantize_mmi(coef_block, divisors, workspace); -} - -GLOBAL(void) -jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, - FAST_FLOAT *workspace) -{ -} - -GLOBAL(int) -jsimd_can_idct_2x2(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_4x4(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_6x6(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_12x12(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_idct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_MMI) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_ifast(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col); -} - -GLOBAL(void) -jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_huff_encode_one_block(void) -{ - return 0; -} - -GLOBAL(JOCTET *) -jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, - int last_dc_val, c_derived_tbl *dctbl, - c_derived_tbl *actbl) -{ - return NULL; -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_first_prepare(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) -{ -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_refine_prepare(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) -{ - return 0; -} diff --git a/simd/loongson/jsimd_mmi.h b/simd/loongson/jsimd_mmi.h 
deleted file mode 100644 index 2506aa8..0000000 --- a/simd/loongson/jsimd_mmi.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Authors: ZhuChen <zhuchen@loongson.cn> - * CaiWanwei <caiwanwei@loongson.cn> - * SunZhangzhi <sunzhangzhi-cq@loongson.cn> - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jdct.h" -#include "loongson-mmintrin.h" - - -/* Common code */ - -#define SIZEOF_MMWORD 8 -#define BYTE_BIT 8 -#define WORD_BIT 16 -#define SCALEBITS 16 - -#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \ - (((uint64_t)(uint8_t)a << 56) | \ - ((uint64_t)(uint8_t)b << 48) | \ - ((uint64_t)(uint8_t)c << 40) | \ - ((uint64_t)(uint8_t)d << 32) | \ - ((uint64_t)(uint8_t)e << 24) | \ - ((uint64_t)(uint8_t)f << 16) | \ - ((uint64_t)(uint8_t)g << 8) | \ - ((uint64_t)(uint8_t)h)) -#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)a << 48) | \ - ((uint64_t)(uint16_t)b << 32) | \ - ((uint64_t)(uint16_t)c << 16) | \ - ((uint64_t)(uint16_t)d)) -#define _uint64_set_pi32(a, b) (((uint64_t)(uint32_t)a << 32) | \ - ((uint64_t)(uint32_t)b)) - -#define get_const_value(index) (*(__m64 *)&const_value[index]) diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h deleted file mode 100644 index 50d166b..0000000 --- a/simd/loongson/loongson-mmintrin.h +++ /dev/null @@ -1,1324 +0,0 @@ -/* - * Loongson MMI optimizations for libjpeg-turbo - * - * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. - * All Rights Reserved. - * Copyright (C) 2019, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
- */ - -#ifndef __LOONGSON_MMINTRIN_H__ -#define __LOONGSON_MMINTRIN_H__ - -#include <stdint.h> - - -#define FUNCTION_ATTRIBS \ - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - - -/* Vectors are stored in 64-bit floating-point registers. */ -typedef double __m64; - -/* Having a 32-bit datatype allows us to use 32-bit loads in places like - load8888. */ -typedef float __m32; - - -/********** Set Operations **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_setzero_si64(void) -{ - return 0.0; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4, - uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0) -{ - __m64 ret; - uint32_t lo = ((uint32_t)__b6 << 24) | - ((uint32_t)__b4 << 16) | - ((uint32_t)__b2 << 8) | - (uint32_t)__b0; - uint32_t hi = ((uint32_t)__b7 << 24) | - ((uint32_t)__b5 << 16) | - ((uint32_t)__b3 << 8) | - (uint32_t)__b1; - - asm("mtc1 %1, %0\n\t" - "mtc1 %2, $f0\n\t" - "punpcklbh %0, %0, $f0\n\t" - : "=f" (ret) - : "r" (lo), "r" (hi) - : "$f0" - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0) -{ - __m64 ret; - uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0; - uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1; - - asm("mtc1 %1, %0\n\t" - "mtc1 %2, $f0\n\t" - "punpcklhw %0, %0, $f0\n\t" - : "=f" (ret) - : "r" (lo), "r" (hi) - : "$f0" - ); - - return ret; -} - -#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ - (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set_pi32(uint32_t __i1, uint32_t __i0) -{ - if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) { - uint64_t val = ((uint64_t)__i1 << 32) | - ((uint64_t)__i0 << 0); - - return *(__m64 *)&val; - } else if (__i1 == __i0) { - uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0); - __m64 ret; - - asm("pshufh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm) - ); - - return ret; - } else { - uint64_t val = ((uint64_t)__i1 << 32) | - ((uint64_t)__i0 << 0); - - return *(__m64 *)&val; - } -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set1_pi8(uint8_t __b0) -{ - __m64 ret; - - asm("sll $8, %1, 8\n\t" - "or %1, %1, $8\n\t" - "mtc1 %1, %0\n\t" - "mtc1 $0, $f0\n\t" - "pshufh %0, %0, $f0\n\t" - : "=f" (ret) - : "r" (__b0) - : "$8", "$f0" - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set1_pi16(uint16_t __h0) -{ - __m64 ret; - - asm("mtc1 %1, %0\n\t" - "mtc1 $0, $f0\n\t" - "pshufh %0, %0, $f0\n\t" - : "=f" (ret) - : "r" (__h0) - : "$8", "$f0" - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_set1_pi32(unsigned __i0) -{ - return _mm_set_pi32(__i0, __i0); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3, - uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7) -{ - return _mm_set_pi8(__h7, __h6, __h5, __h4, - __h3, __h2, __h1, __h0); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3) -{ - return _mm_set_pi16(__w3, __w2, __w1, __w0); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_setr_pi32(uint32_t __i0, uint32_t __i1) -{ - return _mm_set_pi32(__i1, __i0); -} - - -/********** Arithmetic Operations **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_add_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - 
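Every intrinsic in this header follows the pattern that _mm_add_pi8() establishes above: a single Loongson MMI instruction (here paddb) is issued through GCC inline asm, with "f" register constraints on all operands because, as the header notes, MMI vectors live in the 64-bit floating-point registers. As a reference for the lane semantics, here is a minimal scalar sketch, assuming paddb performs eight independent byte additions that wrap modulo 256 (the function name is hypothetical and not part of the header):

#include <stdint.h>

/* Hypothetical scalar model of paddb: eight independent 8-bit lanes,
 * each wrapping modulo 256.  Illustrative only. */
static uint64_t scalar_add_pi8(uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  for (int lane = 0; lane < 8; lane++) {
    uint8_t x = (uint8_t)(a >> (8 * lane));
    uint8_t y = (uint8_t)(b >> (8 * lane));
    r |= (uint64_t)(uint8_t)(x + y) << (8 * lane);  /* truncation = wrap */
  }
  return r;
}

The saturating variants that follow (_mm_adds_pi8(), _mm_adds_pu8(), etc.) differ only in clamping each lane to the signed or unsigned range instead of wrapping.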
-extern __inline __m64 FUNCTION_ATTRIBS -_mm_add_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_add_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_add_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_adds_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddsb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_adds_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddsh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_adds_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddusb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_adds_pu16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("paddush %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_avg_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pavgb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_avg_pu16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pavgh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_madd_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmaddhw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_max_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmaxsh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_max_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmaxub %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_min_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pminsh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_min_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pminub %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline int FUNCTION_ATTRIBS -_mm_movemask_pi8(__m64 __m1) -{ - int ret; - - asm("pmovmskb %0, %1\n\t" - : "=r" (ret) - : "y" (__m1) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_mulhi_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmulhh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_mulhi_pu16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmulhuh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_mullo_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pmullh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_mul_pu32(__m64 __m1, __m64 __m2) -{ 
- __m64 ret; - - asm("pmuluw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_sad_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psadbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_asub_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pasubub %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_biadd_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("biadd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_sub_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_sub_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_sub_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_sub_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_subs_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubsb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_subs_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubsh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_subs_pu8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubusb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_subs_pu16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("psubush %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -/********** Logical Operations **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_and_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("and %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_andnot_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("andn %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_or_si32(__m32 __m1, __m32 __m2) -{ - __m32 ret; - - asm("or %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_or_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("or %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_xor_si64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("xor %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -/********** Shift Operations **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_slli_pi16(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psllh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS 
-_mm_slli_pi32(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psllw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_slli_si64(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("dsll %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srli_pi16(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psrlh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srli_pi32(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psrlw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srli_si64(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("dsrl %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srai_pi16(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psrah %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srai_pi32(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("psraw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_srai_si64(__m64 __m, int64_t __count) -{ - __m64 ret; - - asm("dsra %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__count) - ); - - return ret; -} - - -/********** Conversion Intrinsics **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -to_m64(uint64_t x) -{ - return *(__m64 *)&x; -} - -extern __inline uint64_t FUNCTION_ATTRIBS -to_uint64(__m64 x) -{ - return *(uint64_t *)&x; -} - - -/********** Comparison Intrinsics **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpeqb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpeqh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpeqw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpgtb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpgth %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpgtw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmplt_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpltb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_cmplt_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmplth %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS 
-_mm_cmplt_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("pcmpltw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -/********** Miscellaneous Operations **********/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_packs_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("packsshb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_packs_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("packsswh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_packs_pi32_f(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("packsswh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_packs_pu16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("packushb %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_extract_pi16(__m64 __m, int64_t __pos) -{ - __m64 ret; - - asm("pextrh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__pos) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos) -{ - __m64 ret; - - switch (__pos) { - case 0: - - asm("pinsrh_0 %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2), "i" (__pos) - ); - - break; - - case 1: - - asm("pinsrh_1 %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2), "i" (__pos) - ); - - break; - case 2: - - asm("pinsrh_2 %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2), "i" (__pos) - ); - - break; - - case 3: - - asm("pinsrh_3 %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2), "i" (__pos) - ); - - break; - } - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_shuffle_pi16(__m64 __m, int64_t __n) -{ - __m64 ret; - - asm("pshufh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m), "f" (*(__m64 *)&__n) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpckhbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpckhbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpckhhw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpckhhw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpckhwd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype, - which preserves the data. 
*/ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32, - datatype, which allows load8888 to use 32-bit loads. */ - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklbh %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklhw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklhw %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklwd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2) -{ - __m64 ret; - - asm("punpcklwd %0, %1, %2\n\t" - : "=f" (ret) - : "f" (__m1), "f" (__m2) - ); - - return ret; -} - -extern __inline void FUNCTION_ATTRIBS -_mm_store_pi32(__m32 *dest, __m64 src) -{ - src = _mm_packs_pu16(src, _mm_setzero_si64()); - - asm("swc1 %1, %0\n\t" - : "=m" (*dest) - : "f" (src) - : "memory" - ); -} - -extern __inline void FUNCTION_ATTRIBS -_mm_store_si64(__m64 *dest, __m64 src) -{ - asm("gssdlc1 %1, 7+%0\n\t" - "gssdrc1 %1, %0\n\t" - : "=m" (*dest) - : "f" (src) - : "memory" - ); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_load_si32(const __m32 *src) -{ - __m32 ret; - - asm("lwc1 %0, %1\n\t" - : "=f" (ret) - : "m" (*src) - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_load_si64(const __m64 *src) -{ - __m64 ret; - - asm("ldc1 %0, %1\n\t" - : "=f" (ret) - : "m" (*src) - : "memory" - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadu_si64(const __m64 *src) -{ - __m64 ret; - - asm("gsldlc1 %0, 7(%1)\n\t" - "gsldrc1 %0, 0(%1)\n\t" - : "=f" (ret) - : "r" (src) - : "memory" - ); - - return ret; -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadlo_pi8(const uint32_t *src) -{ - return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64()); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadlo_pi8_f(__m64 src) -{ - return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64()); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadhi_pi8_f(__m64 src) -{ - return _mm_unpackhi_pi8_f(src, _mm_setzero_si64()); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadlo_pi16(__m64 src) -{ - return _mm_unpacklo_pi16(src, _mm_setzero_si64()); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadlo_pi16_f(__m64 src) -{ - return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadhi_pi16(__m64 src) -{ - return _mm_unpackhi_pi16(src, _mm_setzero_si64()); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_loadhi_pi16_f(__m64 src) -{ - return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_expand_alpha(__m64 pixel) -{ - return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3)); -} - -extern __inline __m64 FUNCTION_ATTRIBS -_mm_expand_alpha_rev(__m64 pixel) -{ - return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0)); -} - 
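The _mm_expand_alpha() and _mm_expand_alpha_rev() helpers above depend on the pshufh selector encoding: _MM_SHUFFLE() packs four 2-bit lane indices, so _MM_SHUFFLE(3, 3, 3, 3) broadcasts halfword 3 (the alpha lane of a pixel expanded to 16-bit channels) into all four lanes. A minimal scalar sketch of that selection, assuming pshufh picks source halfwords the same way MMX pshufw does (the function name is hypothetical and not part of the header):

#include <stdint.h>

/* Hypothetical scalar model of pshufh: each 2-bit field of sel selects
 * the source halfword copied into the corresponding destination lane. */
static uint64_t scalar_shuffle_pi16(uint64_t src, unsigned sel)
{
  uint64_t r = 0;
  for (int lane = 0; lane < 4; lane++) {
    unsigned pick = (sel >> (2 * lane)) & 3;    /* 2-bit lane selector */
    uint16_t h = (uint16_t)(src >> (16 * pick)); /* chosen halfword */
    r |= (uint64_t)h << (16 * lane);
  }
  return r;
}

/* scalar_shuffle_pi16(x, _MM_SHUFFLE(3, 3, 3, 3)) replicates bits 63:48 of
 * x into every halfword, mirroring _mm_expand_alpha() above. */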
-#endif /* __LOONGSON_MMINTRIN_H__ */ diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c deleted file mode 100644 index 454cc99..0000000 --- a/simd/mips/jsimd.c +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * jsimd_mips.c - * - * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. - * - * Based on the x86 SIMD extension for IJG JPEG library, - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * For conditions of distribution and use, see copyright notice in jsimdext.inc - * - * This file contains the interface between the "normal" portions - * of the library and the SIMD implementations when running on a - * MIPS architecture. - */ - -#define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" - -#include <stdio.h> -#include <string.h> -#include <ctype.h> - -static unsigned int simd_support = ~0; - -#if defined(__linux__) - -LOCAL(int) -parse_proc_cpuinfo(const char *search_string) -{ - const char *file_name = "/proc/cpuinfo"; - char cpuinfo_line[256]; - FILE *f = NULL; - - simd_support = 0; - - if ((f = fopen(file_name, "r")) != NULL) { - while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { - if (strstr(cpuinfo_line, search_string) != NULL) { - fclose(f); - simd_support |= JSIMD_DSPR2; - return 1; - } - } - fclose(f); - } - /* Did not find string in the proc file, or not Linux ELF. */ - return 0; -} - -#endif - -/* - * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. - */ -LOCAL(void) -init_simd(void) -{ -#ifndef NO_GETENV - char *env = NULL; -#endif - - if (simd_support != ~0U) - return; - - simd_support = 0; - -#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) - simd_support |= JSIMD_DSPR2; -#elif defined(__linux__) - /* We still have a chance to use MIPS DSPR2 regardless of globally used - * -mdspr2 options passed to gcc by performing runtime detection via - * /proc/cpuinfo parsing on linux */ - if (!parse_proc_cpuinfo("MIPS 74K")) - return; -#endif - -#ifndef NO_GETENV - /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCEDSPR2"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_support = JSIMD_DSPR2; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_support = 0; -#endif -} - -static const int mips_idct_ifast_coefs[4] = { - 0x45404540, /* FIX( 1.082392200 / 2) = 17734 = 0x4546 */ - 0x5A805A80, /* FIX( 1.414213562 / 2) = 23170 = 0x5A82 */ - 0x76407640, /* FIX( 1.847759065 / 2) = 30274 = 0x7642 */ - 0xAC60AC60 /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */ -}; - -/* The following struct is borrowed from jdsample.c */ -typedef void (*upsample1_ptr) (j_decompress_ptr cinfo, - jpeg_component_info *compptr, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr); -typedef struct { - struct jpeg_upsampler pub; - JSAMPARRAY color_buf[MAX_COMPONENTS]; - upsample1_ptr methods[MAX_COMPONENTS]; - int next_row_out; - JDIMENSION rows_to_go; - int rowgroup_height[MAX_COMPONENTS]; - UINT8 h_expand[MAX_COMPONENTS]; - UINT8 v_expand[MAX_COMPONENTS]; -} my_upsampler; - -typedef my_upsampler *my_upsample_ptr; - -GLOBAL(int) -jsimd_can_rgb_ycc(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if 
(BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_rgb_gray(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb565(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_c_can_null_convert(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - - switch (cinfo->in_color_space) { - case JCS_EXT_RGB: - dspr2fct = jsimd_extrgb_ycc_convert_dspr2; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - dspr2fct = jsimd_extrgbx_ycc_convert_dspr2; - break; - case JCS_EXT_BGR: - dspr2fct = jsimd_extbgr_ycc_convert_dspr2; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - dspr2fct = jsimd_extbgrx_ycc_convert_dspr2; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - dspr2fct = jsimd_extxbgr_ycc_convert_dspr2; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - dspr2fct = jsimd_extxrgb_ycc_convert_dspr2; - break; - default: - dspr2fct = jsimd_extrgb_ycc_convert_dspr2; - break; - } - - dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); -} - -GLOBAL(void) -jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - - switch (cinfo->in_color_space) { - case JCS_EXT_RGB: - dspr2fct = jsimd_extrgb_gray_convert_dspr2; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - dspr2fct = jsimd_extrgbx_gray_convert_dspr2; - break; - case JCS_EXT_BGR: - dspr2fct = jsimd_extbgr_gray_convert_dspr2; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - dspr2fct = jsimd_extbgrx_gray_convert_dspr2; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - dspr2fct = jsimd_extxbgr_gray_convert_dspr2; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - dspr2fct = jsimd_extxrgb_gray_convert_dspr2; - break; - default: - dspr2fct = jsimd_extrgb_gray_convert_dspr2; - break; - } - - dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); -} - -GLOBAL(void) -jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ - void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - dspr2fct = jsimd_ycc_extrgb_convert_dspr2; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - dspr2fct = jsimd_ycc_extrgbx_convert_dspr2; - break; - case JCS_EXT_BGR: - dspr2fct = 
jsimd_ycc_extbgr_convert_dspr2; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - dspr2fct = jsimd_ycc_extbgrx_convert_dspr2; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - dspr2fct = jsimd_ycc_extxbgr_convert_dspr2; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - dspr2fct = jsimd_ycc_extxrgb_convert_dspr2; - break; - default: - dspr2fct = jsimd_ycc_extrgb_convert_dspr2; - break; - } - - dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); -} - -GLOBAL(void) -jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ -} - -GLOBAL(void) -jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf, - output_row, num_rows, cinfo->num_components); -} - -GLOBAL(int) -jsimd_can_h2v2_downsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v2_smooth_downsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (DCTSIZE != 8) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_downsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); -} - -GLOBAL(void) -jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo, - jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data, - compptr->v_samp_factor, - cinfo->max_v_samp_factor, - cinfo->smoothing_factor, - compptr->width_in_blocks, - cinfo->image_width); -} - -GLOBAL(void) -jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, compptr->width_in_blocks, - input_data, output_data); -} - -GLOBAL(int) -jsimd_can_h2v2_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_int_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) 
-jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, - input_data, output_data_ptr); -} - -GLOBAL(void) -jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width, - input_data, output_data_ptr); -} - -GLOBAL(void) -jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample; - - jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index], - upsample->v_expand[compptr->component_index], - input_data, output_data_ptr, cinfo->output_width, - cinfo->max_v_samp_factor); -} - -GLOBAL(int) -jsimd_can_h2v2_fancy_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_fancy_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, - output_data_ptr); -} - -GLOBAL(void) -jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, - output_data_ptr); -} - -GLOBAL(int) -jsimd_can_h2v2_merged_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_merged_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ - void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2; - break; - case JCS_EXT_BGR: - dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2; - break; - default: - dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2; - break; - } - - dspr2fct(cinfo->output_width, input_buf, 
in_row_group_ctr, output_buf, - cinfo->sample_range_limit); -} - -GLOBAL(void) -jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ - void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2; - break; - case JCS_EXT_BGR: - dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2; - break; - default: - dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2; - break; - } - - dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf, - cinfo->sample_range_limit); -} - -GLOBAL(int) -jsimd_can_convsamp(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_convsamp_float(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - -#ifndef __mips_soft_float - if (simd_support & JSIMD_DSPR2) - return 1; -#endif - - return 0; -} - -GLOBAL(void) -jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM *workspace) -{ - jsimd_convsamp_dspr2(sample_data, start_col, workspace); -} - -GLOBAL(void) -jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT *workspace) -{ -#ifndef __mips_soft_float - jsimd_convsamp_float_dspr2(sample_data, start_col, workspace); -#endif -} - -GLOBAL(int) -jsimd_can_fdct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_ifast(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_fdct_islow(DCTELEM *data) -{ - jsimd_fdct_islow_dspr2(data); -} - -GLOBAL(void) -jsimd_fdct_ifast(DCTELEM *data) -{ - jsimd_fdct_ifast_dspr2(data); -} - -GLOBAL(void) -jsimd_fdct_float(FAST_FLOAT *data) -{ -} - -GLOBAL(int) -jsimd_can_quantize(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_quantize_float(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - 
if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - -#ifndef __mips_soft_float - if (simd_support & JSIMD_DSPR2) - return 1; -#endif - - return 0; -} - -GLOBAL(void) -jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) -{ - jsimd_quantize_dspr2(coef_block, divisors, workspace); -} - -GLOBAL(void) -jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, - FAST_FLOAT *workspace) -{ -#ifndef __mips_soft_float - jsimd_quantize_float_dspr2(coef_block, divisors, workspace); -#endif -} - -GLOBAL(int) -jsimd_can_idct_2x2(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_4x4(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_6x6(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_12x12(void) -{ - init_simd(); - - if (BITS_IN_JSAMPLE != 8) - return 0; - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col); -} - -GLOBAL(void) -jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - int workspace[DCTSIZE * 4]; /* buffers data between passes */ - - jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col, - workspace); -} - -GLOBAL(void) -jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col); -} - -GLOBAL(void) -jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - int workspace[96]; - int output[12] = { - (int)(output_buf[0] + output_col), - (int)(output_buf[1] + output_col), - (int)(output_buf[2] + output_col), - (int)(output_buf[3] + output_col), - (int)(output_buf[4] + output_col), - (int)(output_buf[5] + output_col), - (int)(output_buf[6] + output_col), - (int)(output_buf[7] + output_col), - (int)(output_buf[8] + output_col), - (int)(output_buf[9] + output_col), - (int)(output_buf[10] + output_col), - (int)(output_buf[11] + output_col) - }; - - jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, 
workspace); - jsimd_idct_12x12_pass2_dspr2(workspace, output); -} - -GLOBAL(int) -jsimd_can_idct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(ISLOW_MULT_TYPE) != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_ifast(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(IFAST_MULT_TYPE) != 2) - return 0; - if (IFAST_SCALE_BITS != 2) - return 0; - - if (simd_support & JSIMD_DSPR2) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - int output[8] = { - (int)(output_buf[0] + output_col), - (int)(output_buf[1] + output_col), - (int)(output_buf[2] + output_col), - (int)(output_buf[3] + output_col), - (int)(output_buf[4] + output_col), - (int)(output_buf[5] + output_col), - (int)(output_buf[6] + output_col), - (int)(output_buf[7] + output_col) - }; - - jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output, - IDCT_range_limit(cinfo)); -} - -GLOBAL(void) -jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - JCOEFPTR inptr; - IFAST_MULT_TYPE *quantptr; - DCTELEM workspace[DCTSIZE2]; /* buffers data between passes */ - - /* Pass 1: process columns from input, store into work array. */ - - inptr = coef_block; - quantptr = (IFAST_MULT_TYPE *)compptr->dct_table; - - jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace, - mips_idct_ifast_coefs); - - /* Pass 2: process rows from work array, store into output array. */ - /* Note that we must descale the results by a factor of 8 == 2**3, */ - /* and also undo the PASS1_BITS scaling. */ - - jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col, - mips_idct_ifast_coefs); -} - -GLOBAL(void) -jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_huff_encode_one_block(void) -{ - return 0; -} - -GLOBAL(JOCTET *) -jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, - int last_dc_val, c_derived_tbl *dctbl, - c_derived_tbl *actbl) -{ - return NULL; -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_first_prepare(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) -{ -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_refine_prepare(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) -{ - return 0; -} diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S deleted file mode 100644 index a28c116..0000000 --- a/simd/mips/jsimd_dspr2.S +++ /dev/null @@ -1,4479 +0,0 @@ -/* - * MIPS DSPr2 optimizations for libjpeg-turbo - * - * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. - * All Rights Reserved. 
- * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> - * Darko Laus <darko.laus@imgtec.com> - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#include "jsimd_dspr2_asm.h" - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_c_null_convert_dspr2) -/* - * a0 = cinfo->image_width - * a1 = input_buf - * a2 = output_buf - * a3 = output_row - * 16(sp) = num_rows - * 20(sp) = cinfo->num_components - * - * Null conversion for compression - */ - SAVE_REGS_ON_STACK 8, s0, s1 - - lw t9, 24(sp) // t9 = num_rows - lw s0, 28(sp) // s0 = cinfo->num_components - andi t0, a0, 3 // t0 = cinfo->image_width & 3 - beqz t0, 4f // no residual - nop -0: - addiu t9, t9, -1 - bltz t9, 7f - li t1, 0 -1: - sll t3, t1, 2 - lwx t5, t3(a2) // t5 = outptr = output_buf[ci] - lw t2, 0(a1) // t2 = inptr = *input_buf - sll t4, a3, 2 - lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] - addu t2, t2, t1 - addu s1, t5, a0 - addu t6, t5, t0 -2: - lbu t3, 0(t2) - addiu t5, t5, 1 - sb t3, -1(t5) - bne t6, t5, 2b - addu t2, t2, s0 -3: - lbu t3, 0(t2) - addu t4, t2, s0 - addu t7, t4, s0 - addu t8, t7, s0 - addu t2, t8, s0 - lbu t4, 0(t4) - lbu t7, 0(t7) - lbu t8, 0(t8) - addiu t5, t5, 4 - sb t3, -4(t5) - sb t4, -3(t5) - sb t7, -2(t5) - bne s1, t5, 3b - sb t8, -1(t5) - addiu t1, t1, 1 - bne t1, s0, 1b - nop - addiu a1, a1, 4 - bgez t9, 0b - addiu a3, a3, 1 - b 7f - nop -4: - addiu t9, t9, -1 - bltz t9, 7f - li t1, 0 -5: - sll t3, t1, 2 - lwx t5, t3(a2) // t5 = outptr = output_buf[ci] - lw t2, 0(a1) // t2 = inptr = *input_buf - sll t4, a3, 2 - lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] - addu t2, t2, t1 - addu s1, t5, a0 - addu t6, t5, t0 -6: - lbu t3, 0(t2) - addu t4, t2, s0 - addu t7, t4, s0 - addu t8, t7, s0 - addu t2, t8, s0 - lbu t4, 0(t4) - lbu t7, 0(t7) - lbu t8, 0(t8) - addiu t5, t5, 4 - sb t3, -4(t5) - sb t4, -3(t5) - sb t7, -2(t5) - bne s1, t5, 6b - sb t8, -1(t5) - addiu t1, t1, 1 - bne t1, s0, 5b - nop - addiu a1, a1, 4 - bgez t9, 4b - addiu a3, a3, 1 -7: - RESTORE_REGS_FROM_STACK 8, s0, s1 - - j ra - nop - -END(jsimd_c_null_convert_dspr2) - - -/*****************************************************************************/ -/* - * jsimd_extrgb_ycc_convert_dspr2 - * jsimd_extbgr_ycc_convert_dspr2 - * jsimd_extrgbx_ycc_convert_dspr2 - * jsimd_extbgrx_ycc_convert_dspr2 - * jsimd_extxbgr_ycc_convert_dspr2 - * jsimd_extxrgb_ycc_convert_dspr2 - * - * Colorspace conversion RGB -> YCbCr - */ - -.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \ - r_offs, g_offs, b_offs - -.macro DO_RGB_TO_YCC r, g, b, inptr - lbu \r, \r_offs(\inptr) - lbu \g, \g_offs(\inptr) - lbu 
\b, \b_offs(\inptr) - addiu \inptr, \pixel_size -.endm - -LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2) -/* - * a0 = cinfo->image_width - * a1 = input_buf - * a2 = output_buf - * a3 = output_row - * 16(sp) = num_rows - */ - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - lw t7, 48(sp) // t7 = num_rows - li s0, 0x4c8b // FIX(0.29900) - li s1, 0x9646 // FIX(0.58700) - li s2, 0x1d2f // FIX(0.11400) - li s3, 0xffffd4cd // -FIX(0.16874) - li s4, 0xffffab33 // -FIX(0.33126) - li s5, 0x8000 // FIX(0.50000) - li s6, 0xffff94d1 // -FIX(0.41869) - li s7, 0xffffeb2f // -FIX(0.08131) - li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 - -0: - addiu t7, -1 // --num_rows - lw t6, 0(a1) // t6 = input_buf[0] - lw t0, 0(a2) - lw t1, 4(a2) - lw t2, 8(a2) - sll t3, a3, 2 - lwx t0, t3(t0) // t0 = output_buf[0][output_row] - lwx t1, t3(t1) // t1 = output_buf[1][output_row] - lwx t2, t3(t2) // t2 = output_buf[2][output_row] - - addu t9, t2, a0 // t9 = end address - addiu a3, 1 - -1: - DO_RGB_TO_YCC t3, t4, t5, t6 - - mtlo s5, $ac0 - mtlo t8, $ac1 - mtlo t8, $ac2 - maddu $ac0, s2, t5 - maddu $ac1, s5, t5 - maddu $ac2, s5, t3 - maddu $ac0, s0, t3 - maddu $ac1, s3, t3 - maddu $ac2, s6, t4 - maddu $ac0, s1, t4 - maddu $ac1, s4, t4 - maddu $ac2, s7, t5 - extr.w t3, $ac0, 16 - extr.w t4, $ac1, 16 - extr.w t5, $ac2, 16 - sb t3, 0(t0) - sb t4, 0(t1) - sb t5, 0(t2) - addiu t0, 1 - addiu t2, 1 - bne t2, t9, 1b - addiu t1, 1 - bgtz t7, 0b - addiu a1, 4 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop -END(jsimd_\colorid\()_ycc_convert_dspr2) - -.purgem DO_RGB_TO_YCC - -.endm - -/*-------------------------------------id -- pix R G B */ -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 -GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 - - -/*****************************************************************************/ -/* - * jsimd_ycc_extrgb_convert_dspr2 - * jsimd_ycc_extbgr_convert_dspr2 - * jsimd_ycc_extrgbx_convert_dspr2 - * jsimd_ycc_extbgrx_convert_dspr2 - * jsimd_ycc_extxbgr_convert_dspr2 - * jsimd_ycc_extxrgb_convert_dspr2 - * - * Colorspace conversion YCbCr -> RGB - */ - -.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \ - r_offs, g_offs, b_offs, a_offs - -.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr - sb \scratch0, \r_offs(\outptr) - sb \scratch1, \g_offs(\outptr) - sb \scratch2, \b_offs(\outptr) -.if (\pixel_size == 4) - li t0, 0xFF - sb t0, \a_offs(\outptr) -.endif - addiu \outptr, \pixel_size -.endm - -LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) -/* - * a0 = cinfo->image_width - * a1 = input_buf - * a2 = input_row - * a3 = output_buf - * 16(sp) = num_rows - */ - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - lw s1, 48(sp) - li t3, 0x8000 - li t4, 0x166e9 // FIX(1.40200) - li t5, 0x1c5a2 // FIX(1.77200) - li t6, 0xffff492e // -FIX(0.71414) - li t7, 0xffffa7e6 // -FIX(0.34414) - repl.ph t8, 128 - -0: - lw s0, 0(a3) - lw t0, 0(a1) - lw t1, 4(a1) - lw t2, 8(a1) - sll s5, a2, 2 - addiu s1, -1 - lwx s2, s5(t0) - lwx s3, s5(t1) - lwx s4, s5(t2) - addu t9, s2, a0 - addiu a2, 1 - -1: - lbu s7, 0(s4) // cr - lbu s6, 0(s3) // cb - lbu s5, 0(s2) // y - addiu s2, 1 - addiu s4, 1 - addiu s7, -128 - addiu s6, -128 - mul t2, t7, s6 - mul t0, t6, s7 // Crgtab[cr] - sll s7, 15 - mulq_rs.w t1, t4, s7 // 
Crrtab[cr] - sll s6, 15 - addu t2, t3 // Cbgtab[cb] - addu t2, t0 - - mulq_rs.w t0, t5, s6 // Cbbtab[cb] - sra t2, 16 - addu t1, s5 - addu t2, s5 // add y - ins t2, t1, 16, 16 - subu.ph t2, t2, t8 - addu t0, s5 - shll_s.ph t2, t2, 8 - subu t0, 128 - shra.ph t2, t2, 8 - shll_s.w t0, t0, 24 - addu.ph t2, t2, t8 // clip & store - sra t0, t0, 24 - sra t1, t2, 16 - addiu t0, 128 - - STORE_YCC_TO_RGB t1, t2, t0, s0 - - bne s2, t9, 1b - addiu s3, 1 - bgtz s1, 0b - addiu a3, 4 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop -END(jsimd_ycc_\colorid\()_convert_dspr2) - -.purgem STORE_YCC_TO_RGB - -.endm - -/*-------------------------------------id -- pix R G B A */ -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3 -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3 -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3 -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3 -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0 -GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0 - - -/*****************************************************************************/ -/* - * jsimd_extrgb_gray_convert_dspr2 - * jsimd_extbgr_gray_convert_dspr2 - * jsimd_extrgbx_gray_convert_dspr2 - * jsimd_extbgrx_gray_convert_dspr2 - * jsimd_extxbgr_gray_convert_dspr2 - * jsimd_extxrgb_gray_convert_dspr2 - * - * Colorspace conversion RGB -> GRAY - */ - -.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \ - r_offs, g_offs, b_offs - -.macro DO_RGB_TO_GRAY r, g, b, inptr - lbu \r, \r_offs(\inptr) - lbu \g, \g_offs(\inptr) - lbu \b, \b_offs(\inptr) - addiu \inptr, \pixel_size -.endm - -LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2) -/* - * a0 = cinfo->image_width - * a1 = input_buf - * a2 = output_buf - * a3 = output_row - * 16(sp) = num_rows - */ - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - li s0, 0x4c8b // s0 = FIX(0.29900) - li s1, 0x9646 // s1 = FIX(0.58700) - li s2, 0x1d2f // s2 = FIX(0.11400) - li s7, 0x8000 // s7 = FIX(0.50000) - lw s6, 48(sp) - andi t7, a0, 3 - -0: - addiu s6, -1 // s6 = num_rows - lw t0, 0(a1) - lw t1, 0(a2) - sll t3, a3, 2 - lwx t1, t3(t1) - addiu a3, 1 - addu t9, t1, a0 - subu t8, t9, t7 - beq t1, t8, 2f - nop - -1: - DO_RGB_TO_GRAY t3, t4, t5, t0 - DO_RGB_TO_GRAY s3, s4, s5, t0 - - mtlo s7, $ac0 - maddu $ac0, s2, t5 - maddu $ac0, s1, t4 - maddu $ac0, s0, t3 - mtlo s7, $ac1 - maddu $ac1, s2, s5 - maddu $ac1, s1, s4 - maddu $ac1, s0, s3 - extr.w t6, $ac0, 16 - - DO_RGB_TO_GRAY t3, t4, t5, t0 - DO_RGB_TO_GRAY s3, s4, s5, t0 - - mtlo s7, $ac0 - maddu $ac0, s2, t5 - maddu $ac0, s1, t4 - extr.w t2, $ac1, 16 - maddu $ac0, s0, t3 - mtlo s7, $ac1 - maddu $ac1, s2, s5 - maddu $ac1, s1, s4 - maddu $ac1, s0, s3 - extr.w t5, $ac0, 16 - sb t6, 0(t1) - sb t2, 1(t1) - extr.w t3, $ac1, 16 - addiu t1, 4 - sb t5, -2(t1) - sb t3, -1(t1) - bne t1, t8, 1b - nop - -2: - beqz t7, 4f - nop - -3: - DO_RGB_TO_GRAY t3, t4, t5, t0 - - mtlo s7, $ac0 - maddu $ac0, s2, t5 - maddu $ac0, s1, t4 - maddu $ac0, s0, t3 - extr.w t6, $ac0, 16 - sb t6, 0(t1) - addiu t1, 1 - bne t1, t9, 3b - nop - -4: - bgtz s6, 0b - addiu a1, 4 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop -END(jsimd_\colorid\()_gray_convert_dspr2) - -.purgem DO_RGB_TO_GRAY - -.endm - -/*-------------------------------------id -- pix R G B */ -GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 -GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 -GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 
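
Note on the arithmetic above: the RGB->YCbCr and RGB->grayscale kernels in this hunk share one Q16 fixed-point scheme. The li constants are round(c * 2^16), so 0x4c8b = 19595 for 0.29900, 0x9646 = 38470 for 0.58700, and 0x1d2f = 7471 for 0.11400, while 0x8000 seeds the accumulator with 0.5 for rounding. A minimal scalar sketch of the luma path in C; the function name and types are ours, not library API:

    #include <stdint.h>

    /* Model of the mtlo/maddu/extr.w sequence used by the
     * jsimd_*_gray_convert_dspr2 kernels above (and the Y channel of
     * the ycc converters): Q16 multiply-accumulate, then take the
     * integer part.  Illustrative sketch only. */
    static inline uint8_t rgb_to_gray_q16(uint8_t r, uint8_t g, uint8_t b)
    {
      uint32_t acc = 32768;         /* 0x8000 = 0.5 in Q16, rounding bias */
      acc += 19595u * r;            /* 0x4c8b = FIX(0.29900) */
      acc += 38470u * g;            /* 0x9646 = FIX(0.58700) */
      acc += 7471u * b;             /* 0x1d2f = FIX(0.11400) */
      return (uint8_t)(acc >> 16);  /* extr.w ..., 16 */
    }

The worst-case accumulator value, 32768 + 255 * 65536, fits comfortably in 32 bits, so plain unsigned arithmetic suffices in the sketch.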
-GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 -GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 -GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 - - -/*****************************************************************************/ -/* - * jsimd_h2v2_merged_upsample_dspr2 - * jsimd_h2v2_extrgb_merged_upsample_dspr2 - * jsimd_h2v2_extrgbx_merged_upsample_dspr2 - * jsimd_h2v2_extbgr_merged_upsample_dspr2 - * jsimd_h2v2_extbgrx_merged_upsample_dspr2 - * jsimd_h2v2_extxbgr_merged_upsample_dspr2 - * jsimd_h2v2_extxrgb_merged_upsample_dspr2 - * - * Merged h2v2 upsample routines - */ -.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ - r1_offs, g1_offs, \ - b1_offs, a1_offs, \ - r2_offs, g2_offs, \ - b2_offs, a2_offs - -.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ - scratch5 outptr - sb \scratch0, \r1_offs(\outptr) - sb \scratch1, \g1_offs(\outptr) - sb \scratch2, \b1_offs(\outptr) - sb \scratch3, \r2_offs(\outptr) - sb \scratch4, \g2_offs(\outptr) - sb \scratch5, \b2_offs(\outptr) -.if (\pixel_size == 8) - li \scratch0, 0xFF - sb \scratch0, \a1_offs(\outptr) - sb \scratch0, \a2_offs(\outptr) -.endif - addiu \outptr, \pixel_size -.endm - -.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr - sb \scratch0, \r1_offs(\outptr) - sb \scratch1, \g1_offs(\outptr) - sb \scratch2, \b1_offs(\outptr) - -.if (\pixel_size == 8) - li t0, 0xFF - sb t0, \a1_offs(\outptr) -.endif -.endm - -LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) -/* - * a0 = cinfo->output_width - * a1 = input_buf - * a2 = in_row_group_ctr - * a3 = output_buf - * 16(sp) = cinfo->sample_range_limit - */ - SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra - - lw t9, 56(sp) // cinfo->sample_range_limit - lw v0, 0(a1) - lw v1, 4(a1) - lw t0, 8(a1) - sll t1, a2, 3 - addiu t2, t1, 4 - sll t3, a2, 2 - lw t4, 0(a3) // t4 = output_buf[0] - lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] - lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] - lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] - lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] - lw t7, 4(a3) // t7 = output_buf[1] - li s1, 0xe6ea - addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] - addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] - addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] - xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] - srl t3, a0, 1 - blez t3, 2f - addu t0, t5, t3 // t0 = end address - 1: - lbu t3, 0(t5) - lbu s3, 0(t6) - addiu t5, t5, 1 - addiu t3, t3, -128 // (cb - 128) - addiu s3, s3, -128 // (cr - 128) - mult $ac1, s1, t3 - madd $ac1, s2, s3 - sll s3, s3, 15 - sll t3, t3, 15 - mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS - extr_r.w s5, $ac1, 16 - mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS - lbu v0, 0(t1) - addiu t6, t6, 1 - addiu t1, t1, 2 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue - lbu AT, 0(t3) - lbu s7, 0(s3) - lbu ra, 0(v1) - lbu v0, -1(t1) - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue - lbu t3, 0(t3) - lbu s3, 0(s3) - lbu v1, 0(v1) - lbu v0, 0(t2) - - STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 - - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 
// y+cgreen - addu v1, t9, v1 // y+cblue - lbu AT, 0(t3) - lbu s7, 0(s3) - lbu ra, 0(v1) - lbu v0, 1(t2) - addiu t2, t2, 2 - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue - lbu t3, 0(t3) - lbu s3, 0(s3) - lbu v1, 0(v1) - - STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 - - bne t0, t5, 1b - nop -2: - andi t0, a0, 1 - beqz t0, 4f - lbu t3, 0(t5) - lbu s3, 0(t6) - addiu t3, t3, -128 // (cb - 128) - addiu s3, s3, -128 // (cr - 128) - mult $ac1, s1, t3 - madd $ac1, s2, s3 - sll s3, s3, 15 - sll t3, t3, 15 - lbu v0, 0(t1) - extr_r.w s5, $ac1, 16 - mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS - mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue - lbu t3, 0(t3) - lbu s3, 0(s3) - lbu v1, 0(v1) - lbu v0, 0(t2) - - STORE_H2V2_1_PIXEL t3, s3, v1, t4 - - addu t3, v0, s4 // y+cred - addu s3, v0, s5 // y+cgreen - addu v1, v0, s6 // y+cblue - addu t3, t9, t3 // y+cred - addu s3, t9, s3 // y+cgreen - addu v1, t9, v1 // y+cblue - lbu t3, 0(t3) - lbu s3, 0(s3) - lbu v1, 0(v1) - - STORE_H2V2_1_PIXEL t3, s3, v1, t7 -4: - RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra - - j ra - nop - -END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) - -.purgem STORE_H2V2_1_PIXEL -.purgem STORE_H2V2_2_PIXELS -.endm - -/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 -GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 - - -/*****************************************************************************/ -/* - * jsimd_h2v1_merged_upsample_dspr2 - * jsimd_h2v1_extrgb_merged_upsample_dspr2 - * jsimd_h2v1_extrgbx_merged_upsample_dspr2 - * jsimd_h2v1_extbgr_merged_upsample_dspr2 - * jsimd_h2v1_extbgrx_merged_upsample_dspr2 - * jsimd_h2v1_extxbgr_merged_upsample_dspr2 - * jsimd_h2v1_extxrgb_merged_upsample_dspr2 - * - * Merged h2v1 upsample routines - */ - -.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ - r1_offs, g1_offs, \ - b1_offs, a1_offs, \ - r2_offs, g2_offs, \ - b2_offs, a2_offs - -.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ - scratch5 outptr - sb \scratch0, \r1_offs(\outptr) - sb \scratch1, \g1_offs(\outptr) - sb \scratch2, \b1_offs(\outptr) - sb \scratch3, \r2_offs(\outptr) - sb \scratch4, \g2_offs(\outptr) - sb \scratch5, \b2_offs(\outptr) -.if (\pixel_size == 8) - li t0, 0xFF - sb t0, \a1_offs(\outptr) - sb t0, \a2_offs(\outptr) -.endif - addiu \outptr, \pixel_size -.endm - -.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr - sb \scratch0, \r1_offs(\outptr) - sb \scratch1, \g1_offs(\outptr) - sb \scratch2, \b1_offs(\outptr) -.if (\pixel_size == 8) - li t0, 0xFF - sb t0, \a1_offs(\outptr) -.endif -.endm - -LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) -/* - * a0 = cinfo->output_width - * a1 = input_buf - * a2 = in_row_group_ctr - * a3 = output_buf - * 16(sp) = range_limit - */ - SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, 
s5, s6, s7, ra - - li t0, 0xe6ea - lw t1, 0(a1) // t1 = input_buf[0] - lw t2, 4(a1) // t2 = input_buf[1] - lw t3, 8(a1) // t3 = input_buf[2] - lw t8, 56(sp) // t8 = range_limit - addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] - addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] - addiu s0, t0, 0x9916 // s0 = 0x8000 - addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] - xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] - srl t0, a0, 1 - sll t4, a2, 2 - lwx s5, t4(t1) // s5 = inptr0 - lwx s6, t4(t2) // s6 = inptr1 - lwx s7, t4(t3) // s7 = inptr2 - lw t7, 0(a3) // t7 = outptr - blez t0, 2f - addu t9, s6, t0 // t9 = end address -1: - lbu t2, 0(s6) // t2 = cb - lbu t0, 0(s7) // t0 = cr - lbu t1, 0(s5) // t1 = y - addiu t2, t2, -128 // t2 = cb - 128 - addiu t0, t0, -128 // t0 = cr - 128 - mult $ac1, s4, t2 - madd $ac1, s3, t0 - sll t0, t0, 15 - sll t2, t2, 15 - mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS - extr_r.w t5, $ac1, 16 - mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS - addiu s7, s7, 1 - addiu s6, s6, 1 - addu t2, t1, t0 // t2 = y + cred - addu t3, t1, t5 // t3 = y + cgreen - addu t4, t1, t6 // t4 = y + cblue - addu t2, t8, t2 - addu t3, t8, t3 - addu t4, t8, t4 - lbu t1, 1(s5) - lbu v0, 0(t2) - lbu v1, 0(t3) - lbu ra, 0(t4) - addu t2, t1, t0 - addu t3, t1, t5 - addu t4, t1, t6 - addu t2, t8, t2 - addu t3, t8, t3 - addu t4, t8, t4 - lbu t2, 0(t2) - lbu t3, 0(t3) - lbu t4, 0(t4) - - STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 - - bne t9, s6, 1b - addiu s5, s5, 2 -2: - andi t0, a0, 1 - beqz t0, 4f - nop -3: - lbu t2, 0(s6) - lbu t0, 0(s7) - lbu t1, 0(s5) - addiu t2, t2, -128 // (cb - 128) - addiu t0, t0, -128 // (cr - 128) - mul t3, s4, t2 - mul t4, s3, t0 - sll t0, t0, 15 - sll t2, t2, 15 - mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS - mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS - addu t3, t3, s0 - addu t3, t4, t3 - sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS - addu t2, t1, t0 // y + cred - addu t3, t1, t5 // y + cgreen - addu t4, t1, t6 // y + cblue - addu t2, t8, t2 - addu t3, t8, t3 - addu t4, t8, t4 - lbu t2, 0(t2) - lbu t3, 0(t3) - lbu t4, 0(t4) - - STORE_H2V1_1_PIXEL t2, t3, t4, t7 -4: - RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra - - j ra - nop - -END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) - -.purgem STORE_H2V1_1_PIXEL -.purgem STORE_H2V1_2_PIXELS -.endm - -/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 -GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 - - -/*****************************************************************************/ -/* - * jsimd_h2v2_fancy_upsample_dspr2 - * - * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. 
- */ -LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) -/* - * a0 = cinfo->max_v_samp_factor - * a1 = downsampled_width - * a2 = input_data - * a3 = output_data_ptr - */ - SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 - - li s4, 0 - lw s2, 0(a3) // s2 = *output_data_ptr -0: - li t9, 2 - lw s1, -4(a2) // s1 = inptr1 - -1: - lw s0, 0(a2) // s0 = inptr0 - lwx s3, s4(s2) - addiu s5, a1, -2 // s5 = downsampled_width - 2 - srl t4, s5, 1 - sll t4, t4, 1 - lbu t0, 0(s0) - lbu t1, 1(s0) - lbu t2, 0(s1) - lbu t3, 1(s1) - addiu s0, 2 - addiu s1, 2 - addu t8, s0, t4 // t8 = end address - andi s5, s5, 1 // s5 = residual - sll t4, t0, 1 - sll t6, t1, 1 - addu t0, t0, t4 // t0 = (*inptr0++) * 3 - addu t1, t1, t6 // t1 = (*inptr0++) * 3 - addu t7, t0, t2 // t7 = thiscolsum - addu t6, t1, t3 // t5 = nextcolsum - sll t0, t7, 2 // t0 = thiscolsum * 4 - subu t1, t0, t7 // t1 = thiscolsum * 3 - shra_r.w t0, t0, 4 - addiu t1, 7 - addu t1, t1, t6 - srl t1, t1, 4 - sb t0, 0(s3) - sb t1, 1(s3) - beq t8, s0, 22f // skip to final iteration if width == 3 - addiu s3, 2 -2: - lh t0, 0(s0) // t0 = A3|A2 - lh t2, 0(s1) // t2 = B3|B2 - addiu s0, 2 - addiu s1, 2 - preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 - preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 - shll.ph t1, t0, 1 - sll t3, t6, 1 - addu.ph t0, t1, t0 // t0 = A3*3|A2*3 - addu t3, t3, t6 // t3 = this * 3 - addu.ph t0, t0, t2 // t0 = next2|next1 - addu t1, t3, t7 - andi t7, t0, 0xFFFF // t7 = next1 - sll t2, t7, 1 - addu t2, t7, t2 // t2 = next1*3 - addu t4, t2, t6 - srl t6, t0, 16 // t6 = next2 - shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 - addu t0, t3, t7 - addiu t0, 7 - srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 - shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 - addu t2, t2, t6 - addiu t2, 7 - srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 - sb t1, 0(s3) - sb t0, 1(s3) - sb t4, 2(s3) - sb t2, 3(s3) - bne t8, s0, 2b - addiu s3, 4 -22: - beqz s5, 4f - addu t8, s0, s5 -3: - lbu t0, 0(s0) - lbu t2, 0(s1) - addiu s0, 1 - addiu s1, 1 - sll t3, t6, 1 - sll t1, t0, 1 - addu t1, t0, t1 // t1 = inptr0 * 3 - addu t3, t3, t6 // t3 = thiscolsum * 3 - addu t5, t1, t2 - addu t1, t3, t7 - shra_r.w t1, t1, 4 - addu t0, t3, t5 - addiu t0, 7 - srl t0, t0, 4 - sb t1, 0(s3) - sb t0, 1(s3) - addiu s3, 2 - move t7, t6 - bne t8, s0, 3b - move t6, t5 -4: - sll t0, t6, 2 // t0 = thiscolsum * 4 - subu t1, t0, t6 // t1 = thiscolsum * 3 - addu t1, t1, t7 - addiu s4, 4 - shra_r.w t1, t1, 4 - addiu t0, 7 - srl t0, t0, 4 - sb t1, 0(s3) - sb t0, 1(s3) - addiu t9, -1 - addiu s3, 2 - bnez t9, 1b - lw s1, 4(a2) - srl t0, s4, 2 - subu t0, a0, t0 - bgtz t0, 0b - addiu a2, 4 - - RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 - - j ra - nop -END(jsimd_h2v2_fancy_upsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) -/* - * a0 = cinfo->max_v_samp_factor - * a1 = downsampled_width - * a2 = input_data - * a3 = output_data_ptr - */ - SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 - - .set at - - beqz a0, 3f - sll t0, a0, 2 - lw s1, 0(a3) - li s3, 0x10001 - addu s0, s1, t0 -0: - addiu t8, a1, -2 - srl t9, t8, 2 - lw t7, 0(a2) - lw s2, 0(s1) - lbu t0, 0(t7) - lbu t1, 1(t7) // t1 = inptr[1] - sll t2, t0, 1 - addu t2, t2, t0 // t2 = invalue*3 - addu t2, t2, t1 - shra_r.w t2, t2, 2 - sb t0, 0(s2) - sb t2, 1(s2) - beqz t9, 11f - addiu s2, 2 -1: - ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| - ulw t1, 1(t7) - ulh t2, 4(t7) // t2 = |0|0|P5|P4| - preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| - preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| - 
preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| - preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| - preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| - shll.ph t5, t4, 1 - shll.ph t6, t1, 1 - addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| - addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| - addu.ph t4, t3, s3 - addu.ph t0, t0, s3 - addu.ph t4, t4, t5 - addu.ph t0, t0, t6 - shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| - shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| - addu.ph t2, t2, t5 - addu.ph t3, t3, t6 - shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| - shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| - shll.ph t2, t2, 8 - shll.ph t3, t3, 8 - or t2, t4, t2 - or t3, t3, t0 - addiu t9, -1 - usw t3, 0(s2) - usw t2, 4(s2) - addiu s2, 8 - bgtz t9, 1b - addiu t7, 4 -11: - andi t8, 3 - beqz t8, 22f - addiu t7, 1 - -2: - lbu t0, 0(t7) - addiu t7, 1 - sll t1, t0, 1 - addu t2, t0, t1 // t2 = invalue - lbu t3, -2(t7) - lbu t4, 0(t7) - addiu t3, 1 - addiu t4, 2 - addu t3, t3, t2 - addu t4, t4, t2 - srl t3, 2 - srl t4, 2 - sb t3, 0(s2) - sb t4, 1(s2) - addiu t8, -1 - bgtz t8, 2b - addiu s2, 2 - -22: - lbu t0, 0(t7) - lbu t2, -1(t7) - sll t1, t0, 1 - addu t1, t1, t0 // t1 = invalue * 3 - addu t1, t1, t2 - addiu t1, 1 - srl t1, t1, 2 - sb t1, 0(s2) - sb t0, 1(s2) - addiu s1, 4 - bne s1, s0, 0b - addiu a2, 4 -3: - RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 - - j ra - nop -END(jsimd_h2v1_fancy_upsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) -/* - * a0 = cinfo->image_width - * a1 = cinfo->max_v_samp_factor - * a2 = compptr->v_samp_factor - * a3 = compptr->width_in_blocks - * 16(sp) = input_data - * 20(sp) = output_data - */ - .set at - - SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 - - beqz a2, 7f - lw s1, 44(sp) // s1 = output_data - lw s0, 40(sp) // s0 = input_data - srl s2, a0, 2 - andi t9, a0, 2 - srl t7, t9, 1 - addu s2, t7, s2 - sll t0, a3, 3 // t0 = width_in_blocks*DCT - srl t7, t0, 1 - subu s2, t7, s2 -0: - andi t6, a0, 1 // t6 = temp_index - addiu t6, -1 - lw t4, 0(s1) // t4 = outptr - lw t5, 0(s0) // t5 = inptr0 - li s3, 0 // s3 = bias - srl t7, a0, 1 // t7 = image_width1 - srl s4, t7, 2 - andi t8, t7, 3 -1: - ulhu t0, 0(t5) - ulhu t1, 2(t5) - ulhu t2, 4(t5) - ulhu t3, 6(t5) - raddu.w.qb t0, t0 - raddu.w.qb t1, t1 - raddu.w.qb t2, t2 - raddu.w.qb t3, t3 - shra.ph t0, t0, 1 - shra_r.ph t1, t1, 1 - shra.ph t2, t2, 1 - shra_r.ph t3, t3, 1 - sb t0, 0(t4) - sb t1, 1(t4) - sb t2, 2(t4) - sb t3, 3(t4) - addiu s4, -1 - addiu t4, 4 - bgtz s4, 1b - addiu t5, 8 - beqz t8, 3f - addu s4, t4, t8 -2: - ulhu t0, 0(t5) - raddu.w.qb t0, t0 - addqh.w t0, t0, s3 - xori s3, s3, 1 - sb t0, 0(t4) - addiu t4, 1 - bne t4, s4, 2b - addiu t5, 2 -3: - lbux t1, t6(t5) - sll t1, 1 - addqh.w t2, t1, s3 // t2 = pixval1 - xori s3, s3, 1 - addqh.w t3, t1, s3 // t3 = pixval2 - blez s2, 5f - append t3, t2, 8 - addu t5, t4, s2 // t5 = loop_end2 -4: - ush t3, 0(t4) - addiu s2, -1 - bgtz s2, 4b - addiu t4, 2 -5: - beqz t9, 6f - nop - sb t2, 0(t4) -6: - addiu s1, 4 - addiu a2, -1 - bnez a2, 0b - addiu s0, 4 -7: - RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 - - j ra - nop -END(jsimd_h2v1_downsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) -/* - * a0 = cinfo->image_width - * a1 = cinfo->max_v_samp_factor - * a2 = compptr->v_samp_factor - * a3 = compptr->width_in_blocks - * 16(sp) = input_data - * 20(sp) = output_data - */ - .set at - - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - beqz a2, 8f - lw s1, 52(sp) // 
s1 = output_data - lw s0, 48(sp) // s0 = input_data - - andi t6, a0, 1 // t6 = temp_index - addiu t6, -1 - srl t7, a0, 1 // t7 = image_width1 - srl s4, t7, 2 - andi t8, t7, 3 - andi t9, a0, 2 - srl s2, a0, 2 - srl t7, t9, 1 - addu s2, t7, s2 - sll t0, a3, 3 // s2 = width_in_blocks*DCT - srl t7, t0, 1 - subu s2, t7, s2 -0: - lw t4, 0(s1) // t4 = outptr - lw t5, 0(s0) // t5 = inptr0 - lw s7, 4(s0) // s7 = inptr1 - li s6, 1 // s6 = bias -2: - ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| - ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| - ulw t2, 4(t5) - ulw t3, 4(s7) - precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| - ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| - raddu.w.qb t1, t7 - raddu.w.qb t0, t0 - shra_r.w t1, t1, 2 - addiu t0, 1 - srl t0, 2 - precrq.ph.w t7, t2, t3 - ins t2, t3, 16, 16 - raddu.w.qb t7, t7 - raddu.w.qb t2, t2 - shra_r.w t7, t7, 2 - addiu t2, 1 - srl t2, 2 - sb t0, 0(t4) - sb t1, 1(t4) - sb t2, 2(t4) - sb t7, 3(t4) - addiu t4, 4 - addiu t5, 8 - addiu s4, s4, -1 - bgtz s4, 2b - addiu s7, 8 - beqz t8, 4f - addu t8, t4, t8 -3: - ulhu t0, 0(t5) - ulhu t1, 0(s7) - ins t0, t1, 16, 16 - raddu.w.qb t0, t0 - addu t0, t0, s6 - srl t0, 2 - xori s6, s6, 3 - sb t0, 0(t4) - addiu t5, 2 - addiu t4, 1 - bne t8, t4, 3b - addiu s7, 2 -4: - lbux t1, t6(t5) - sll t1, 1 - lbux t0, t6(s7) - sll t0, 1 - addu t1, t1, t0 - addu t3, t1, s6 - srl t0, t3, 2 // t2 = pixval1 - xori s6, s6, 3 - addu t2, t1, s6 - srl t1, t2, 2 // t3 = pixval2 - blez s2, 6f - append t1, t0, 8 -5: - ush t1, 0(t4) - addiu s2, -1 - bgtz s2, 5b - addiu t4, 2 -6: - beqz t9, 7f - nop - sb t0, 0(t4) -7: - addiu s1, 4 - addiu a2, -1 - bnez a2, 0b - addiu s0, 8 -8: - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop -END(jsimd_h2v2_downsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) -/* - * a0 = input_data - * a1 = output_data - * a2 = compptr->v_samp_factor - * a3 = cinfo->max_v_samp_factor - * 16(sp) = cinfo->smoothing_factor - * 20(sp) = compptr->width_in_blocks - * 24(sp) = cinfo->image_width - */ - .set at - - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - lw s7, 52(sp) // compptr->width_in_blocks - lw s0, 56(sp) // cinfo->image_width - lw s6, 48(sp) // cinfo->smoothing_factor - sll s7, 3 // output_cols = width_in_blocks * DCTSIZE - sll v0, s7, 1 - subu v0, v0, s0 - blez v0, 2f - move v1, zero - addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 -0: - addiu t1, a0, -4 - sll t2, v1, 2 - lwx t1, t2(t1) - move t3, v0 - addu t1, t1, s0 - lbu t2, -1(t1) -1: - addiu t3, t3, -1 - sb t2, 0(t1) - bgtz t3, 1b - addiu t1, t1, 1 - addiu v1, v1, 1 - bne v1, t0, 0b - nop -2: - li v0, 80 - mul v0, s6, v0 - li v1, 16384 - move t4, zero - move t5, zero - subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 - sll t7, s6, 4 // t7 = tmp_smoot_f * 16 -3: -/* Special case for first column: pretend column -1 is same as column 0 */ - sll v0, t4, 2 - lwx t8, v0(a1) // outptr = output_data[outrow] - sll v1, t5, 2 - addiu t9, v1, 4 - addiu s0, v1, -4 - addiu s1, v1, 8 - lwx s2, v1(a0) // inptr0 = input_data[inrow] - lwx t9, t9(a0) // inptr1 = input_data[inrow+1] - lwx s0, s0(a0) // above_ptr = input_data[inrow-1] - lwx s1, s1(a0) // below_ptr = input_data[inrow+2] - lh v0, 0(s2) - lh v1, 0(t9) - lh t0, 0(s0) - lh t1, 0(s1) - ins v0, v1, 16, 16 - ins t0, t1, 16, 16 - raddu.w.qb t2, v0 - raddu.w.qb s3, t0 - lbu v0, 0(s2) - lbu v1, 2(s2) - lbu t0, 0(t9) - lbu t1, 2(t9) - addu v0, v0, v1 - mult $ac1, t2, t6 - addu t0, t0, t1 - lbu t2, 2(s0) - addu t0, 
t0, v0 - lbu t3, 2(s1) - addu s3, t0, s3 - lbu v0, 0(s0) - lbu t0, 0(s1) - sll s3, s3, 1 - addu v0, v0, t2 - addu t0, t0, t3 - addu t0, t0, v0 - addu s3, t0, s3 - madd $ac1, s3, t7 - extr_r.w v0, $ac1, 16 - addiu t8, t8, 1 - addiu s2, s2, 2 - addiu t9, t9, 2 - addiu s0, s0, 2 - addiu s1, s1, 2 - sb v0, -1(t8) - addiu s4, s7, -2 - and s4, s4, 3 - addu s5, s4, t8 // end address -4: - lh v0, 0(s2) - lh v1, 0(t9) - lh t0, 0(s0) - lh t1, 0(s1) - ins v0, v1, 16, 16 - ins t0, t1, 16, 16 - raddu.w.qb t2, v0 - raddu.w.qb s3, t0 - lbu v0, -1(s2) - lbu v1, 2(s2) - lbu t0, -1(t9) - lbu t1, 2(t9) - addu v0, v0, v1 - mult $ac1, t2, t6 - addu t0, t0, t1 - lbu t2, 2(s0) - addu t0, t0, v0 - lbu t3, 2(s1) - addu s3, t0, s3 - lbu v0, -1(s0) - lbu t0, -1(s1) - sll s3, s3, 1 - addu v0, v0, t2 - addu t0, t0, t3 - addu t0, t0, v0 - addu s3, t0, s3 - madd $ac1, s3, t7 - extr_r.w t2, $ac1, 16 - addiu t8, t8, 1 - addiu s2, s2, 2 - addiu t9, t9, 2 - addiu s0, s0, 2 - sb t2, -1(t8) - bne s5, t8, 4b - addiu s1, s1, 2 - addiu s5, s7, -2 - subu s5, s5, s4 - addu s5, s5, t8 // end address -5: - lh v0, 0(s2) - lh v1, 0(t9) - lh t0, 0(s0) - lh t1, 0(s1) - ins v0, v1, 16, 16 - ins t0, t1, 16, 16 - raddu.w.qb t2, v0 - raddu.w.qb s3, t0 - lbu v0, -1(s2) - lbu v1, 2(s2) - lbu t0, -1(t9) - lbu t1, 2(t9) - addu v0, v0, v1 - mult $ac1, t2, t6 - addu t0, t0, t1 - lbu t2, 2(s0) - addu t0, t0, v0 - lbu t3, 2(s1) - addu s3, t0, s3 - lbu v0, -1(s0) - lbu t0, -1(s1) - sll s3, s3, 1 - addu v0, v0, t2 - addu t0, t0, t3 - lh v1, 2(t9) - addu t0, t0, v0 - lh v0, 2(s2) - addu s3, t0, s3 - lh t0, 2(s0) - lh t1, 2(s1) - madd $ac1, s3, t7 - extr_r.w t2, $ac1, 16 - ins t0, t1, 16, 16 - ins v0, v1, 16, 16 - raddu.w.qb s3, t0 - lbu v1, 4(s2) - lbu t0, 1(t9) - lbu t1, 4(t9) - sb t2, 0(t8) - raddu.w.qb t3, v0 - lbu v0, 1(s2) - addu t0, t0, t1 - mult $ac1, t3, t6 - addu v0, v0, v1 - lbu t2, 4(s0) - addu t0, t0, v0 - lbu v0, 1(s0) - addu s3, t0, s3 - lbu t0, 1(s1) - lbu t3, 4(s1) - addu v0, v0, t2 - sll s3, s3, 1 - addu t0, t0, t3 - lh v1, 4(t9) - addu t0, t0, v0 - lh v0, 4(s2) - addu s3, t0, s3 - lh t0, 4(s0) - lh t1, 4(s1) - madd $ac1, s3, t7 - extr_r.w t2, $ac1, 16 - ins t0, t1, 16, 16 - ins v0, v1, 16, 16 - raddu.w.qb s3, t0 - lbu v1, 6(s2) - lbu t0, 3(t9) - lbu t1, 6(t9) - sb t2, 1(t8) - raddu.w.qb t3, v0 - lbu v0, 3(s2) - addu t0, t0, t1 - mult $ac1, t3, t6 - addu v0, v0, v1 - lbu t2, 6(s0) - addu t0, t0, v0 - lbu v0, 3(s0) - addu s3, t0, s3 - lbu t0, 3(s1) - lbu t3, 6(s1) - addu v0, v0, t2 - sll s3, s3, 1 - addu t0, t0, t3 - lh v1, 6(t9) - addu t0, t0, v0 - lh v0, 6(s2) - addu s3, t0, s3 - lh t0, 6(s0) - lh t1, 6(s1) - madd $ac1, s3, t7 - extr_r.w t3, $ac1, 16 - ins t0, t1, 16, 16 - ins v0, v1, 16, 16 - raddu.w.qb s3, t0 - lbu v1, 8(s2) - lbu t0, 5(t9) - lbu t1, 8(t9) - sb t3, 2(t8) - raddu.w.qb t2, v0 - lbu v0, 5(s2) - addu t0, t0, t1 - mult $ac1, t2, t6 - addu v0, v0, v1 - lbu t2, 8(s0) - addu t0, t0, v0 - lbu v0, 5(s0) - addu s3, t0, s3 - lbu t0, 5(s1) - lbu t3, 8(s1) - addu v0, v0, t2 - sll s3, s3, 1 - addu t0, t0, t3 - addiu t8, t8, 4 - addu t0, t0, v0 - addiu s2, s2, 8 - addu s3, t0, s3 - addiu t9, t9, 8 - madd $ac1, s3, t7 - extr_r.w t1, $ac1, 16 - addiu s0, s0, 8 - addiu s1, s1, 8 - bne s5, t8, 5b - sb t1, -1(t8) -/* Special case for last column */ - lh v0, 0(s2) - lh v1, 0(t9) - lh t0, 0(s0) - lh t1, 0(s1) - ins v0, v1, 16, 16 - ins t0, t1, 16, 16 - raddu.w.qb t2, v0 - raddu.w.qb s3, t0 - lbu v0, -1(s2) - lbu v1, 1(s2) - lbu t0, -1(t9) - lbu t1, 1(t9) - addu v0, v0, v1 - mult $ac1, t2, t6 - addu t0, t0, t1 - lbu t2, 1(s0) - addu t0, 
t0, v0 - lbu t3, 1(s1) - addu s3, t0, s3 - lbu v0, -1(s0) - lbu t0, -1(s1) - sll s3, s3, 1 - addu v0, v0, t2 - addu t0, t0, t3 - addu t0, t0, v0 - addu s3, t0, s3 - madd $ac1, s3, t7 - extr_r.w t0, $ac1, 16 - addiu t5, t5, 2 - sb t0, 0(t8) - addiu t4, t4, 1 - bne t4, a2, 3b - addiu t5, t5, 2 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop - -END(jsimd_h2v2_smooth_downsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_int_upsample_dspr2) -/* - * a0 = upsample->h_expand[compptr->component_index] - * a1 = upsample->v_expand[compptr->component_index] - * a2 = input_data - * a3 = output_data_ptr - * 16(sp) = cinfo->output_width - * 20(sp) = cinfo->max_v_samp_factor - */ - .set at - - SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 - - lw s0, 0(a3) // s0 = output_data - lw s1, 32(sp) // s1 = cinfo->output_width - lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor - li t6, 0 // t6 = inrow - beqz s2, 10f - li s3, 0 // s3 = outrow -0: - addu t0, a2, t6 - addu t7, s0, s3 - lw t3, 0(t0) // t3 = inptr - lw t8, 0(t7) // t8 = outptr - beqz s1, 4f - addu t5, t8, s1 // t5 = outend -1: - lb t2, 0(t3) // t2 = invalue = *inptr++ - addiu t3, 1 - beqz a0, 3f - move t0, a0 // t0 = h_expand -2: - sb t2, 0(t8) - addiu t0, -1 - bgtz t0, 2b - addiu t8, 1 -3: - bgt t5, t8, 1b - nop -4: - addiu t9, a1, -1 // t9 = v_expand - 1 - blez t9, 9f - nop -5: - lw t3, 0(s0) - lw t4, 4(s0) - subu t0, s1, 0xF - blez t0, 7f - addu t5, t3, s1 // t5 = end address - andi t7, s1, 0xF // t7 = residual - subu t8, t5, t7 -6: - ulw t0, 0(t3) - ulw t1, 4(t3) - ulw t2, 8(t3) - usw t0, 0(t4) - ulw t0, 12(t3) - usw t1, 4(t4) - usw t2, 8(t4) - usw t0, 12(t4) - addiu t3, 16 - bne t3, t8, 6b - addiu t4, 16 - beqz t7, 8f - nop -7: - lbu t0, 0(t3) - sb t0, 0(t4) - addiu t3, 1 - bne t3, t5, 7b - addiu t4, 1 -8: - addiu t9, -1 - bgtz t9, 5b - addiu s0, 8 -9: - addu s3, s3, a1 - bne s3, s2, 0b - addiu t6, 1 -10: - RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 - - j ra - nop -END(jsimd_int_upsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v1_upsample_dspr2) -/* - * a0 = cinfo->max_v_samp_factor - * a1 = cinfo->output_width - * a2 = input_data - * a3 = output_data_ptr - */ - lw t7, 0(a3) // t7 = output_data - andi t8, a1, 0xf // t8 = residual - sll t0, a0, 2 - blez a0, 4f - addu t9, t7, t0 // t9 = output_data end address -0: - lw t5, 0(t7) // t5 = outptr - lw t6, 0(a2) // t6 = inptr - addu t3, t5, a1 // t3 = outptr + output_width (end address) - subu t3, t8 // t3 = end address - residual - beq t5, t3, 2f - move t4, t8 -1: - ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| - ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| - srl t1, t0, 16 // t1 = |X|X|P3|P2| - ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| - ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| - ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| - ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| - usw t0, 0(t5) - usw t1, 4(t5) - srl t0, t2, 16 // t0 = |X|X|P7|P6| - ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| - ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| - ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| - ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6| - usw t2, 8(t5) - usw t0, 12(t5) - addiu t5, 16 - bne t5, t3, 1b - addiu t6, 8 - beqz t8, 3f - move t4, t8 -2: - lbu t1, 0(t6) - sb t1, 0(t5) - sb t1, 1(t5) - addiu t4, -2 - addiu t6, 1 - bgtz t4, 2b - addiu t5, 2 -3: - addiu t7, 4 - bne t9, t7, 0b - addiu a2, 4 -4: - j ra - nop -END(jsimd_h2v1_upsample_dspr2) - - 
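
For orientation before the h2v2 variant below: jsimd_h2v1_upsample_dspr2 above is plain pixel replication, and the ins/usw word games merely emit 16 output bytes per 8 input bytes at a time. A byte-for-byte scalar equivalent in C (names are ours; the real routine iterates over output_width with a separate residual loop):

    #include <stddef.h>

    /* Double each sample horizontally: the |P1|P1|P0|P0| pattern the
     * assembly builds with ins and stores with usw. */
    static void h2v1_upsample_row(const unsigned char *inptr,
                                  unsigned char *outptr, size_t in_width)
    {
      for (size_t i = 0; i < in_width; i++) {
        outptr[2 * i] = inptr[i];
        outptr[2 * i + 1] = inptr[i];
      }
    }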
-/*****************************************************************************/ -LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) -/* - * a0 = cinfo->max_v_samp_factor - * a1 = cinfo->output_width - * a2 = input_data - * a3 = output_data_ptr - */ - lw t7, 0(a3) - blez a0, 7f - andi t9, a1, 0xf // t9 = residual -0: - lw t6, 0(a2) // t6 = inptr - lw t5, 0(t7) // t5 = outptr - addu t8, t5, a1 // t8 = outptr end address - subu t8, t9 // t8 = end address - residual - beq t5, t8, 2f - move t4, t9 -1: - ulw t0, 0(t6) - srl t1, t0, 16 - ins t0, t0, 16, 16 - ins t0, t0, 8, 16 - ins t1, t1, 16, 16 - ins t1, t1, 8, 16 - ulw t2, 4(t6) - usw t0, 0(t5) - usw t1, 4(t5) - srl t3, t2, 16 - ins t2, t2, 16, 16 - ins t2, t2, 8, 16 - ins t3, t3, 16, 16 - ins t3, t3, 8, 16 - usw t2, 8(t5) - usw t3, 12(t5) - addiu t5, 16 - bne t5, t8, 1b - addiu t6, 8 - beqz t9, 3f - move t4, t9 -2: - lbu t0, 0(t6) - sb t0, 0(t5) - sb t0, 1(t5) - addiu t4, -2 - addiu t6, 1 - bgtz t4, 2b - addiu t5, 2 -3: - lw t6, 0(t7) // t6 = outptr[0] - lw t5, 4(t7) // t5 = outptr[1] - addu t4, t6, a1 // t4 = new end address - beq a1, t9, 5f - subu t8, t4, t9 -4: - ulw t0, 0(t6) - ulw t1, 4(t6) - ulw t2, 8(t6) - usw t0, 0(t5) - ulw t0, 12(t6) - usw t1, 4(t5) - usw t2, 8(t5) - usw t0, 12(t5) - addiu t6, 16 - bne t6, t8, 4b - addiu t5, 16 - beqz t9, 6f - nop -5: - lbu t0, 0(t6) - sb t0, 0(t5) - addiu t6, 1 - bne t6, t4, 5b - addiu t5, 1 -6: - addiu t7, 8 - addiu a0, -2 - bgtz a0, 0b - addiu a2, 4 -7: - j ra - nop -END(jsimd_h2v2_upsample_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_islow_dspr2) -/* - * a0 = coef_block - * a1 = compptr->dcttable - * a2 = output - * a3 = range_limit - */ - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - addiu sp, sp, -256 - move v0, sp - addiu v1, zero, 8 // v1 = DCTSIZE = 8 -1: - lh s4, 32(a0) // s4 = inptr[16] - lh s5, 64(a0) // s5 = inptr[32] - lh s6, 96(a0) // s6 = inptr[48] - lh t1, 112(a0) // t1 = inptr[56] - lh t7, 16(a0) // t7 = inptr[8] - lh t5, 80(a0) // t5 = inptr[40] - lh t3, 48(a0) // t3 = inptr[24] - or s4, s4, t1 - or s4, s4, t3 - or s4, s4, t5 - or s4, s4, t7 - or s4, s4, s5 - or s4, s4, s6 - bnez s4, 2f - addiu v1, v1, -1 - lh s5, 0(a1) // quantptr[DCTSIZE*0] - lh s6, 0(a0) // inptr[DCTSIZE*0] - mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) - sll s5, s5, 2 - sw s5, 0(v0) - sw s5, 32(v0) - sw s5, 64(v0) - sw s5, 96(v0) - sw s5, 128(v0) - sw s5, 160(v0) - sw s5, 192(v0) - b 3f - sw s5, 224(v0) -2: - lh t0, 112(a1) - lh t2, 48(a1) - lh t4, 80(a1) - lh t6, 16(a1) - mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) - mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) - mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) - mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) - lh t4, 32(a1) - lh t5, 32(a0) - lh t6, 96(a1) - lh t7, 96(a0) - addu s0, t0, t1 // z3 = tmp0 + tmp2 - addu s1, t1, t2 // z2 = tmp1 + tmp2 - addu s2, t2, t3 // z4 = tmp1 + tmp3 - addu s3, s0, s2 // z3 + z4 - addiu t9, zero, 9633 // FIX_1_175875602 - mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) - addu t8, t0, t3 // z1 = tmp0 + tmp3 - addiu t9, zero, 2446 // FIX_0_298631336 - mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) - addiu t9, zero, 16819 // FIX_2_053119869 - mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) - addiu t9, zero, 25172 // FIX_3_072711026 - mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) - addiu t9, zero, 12299 // FIX_1_501321110 - mul 
t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) - addiu t9, zero, 16069 // FIX_1_961570560 - mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) - addiu t9, zero, 3196 // FIX_0_390180644 - mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) - addiu t9, zero, 7373 // FIX_0_899976223 - mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) - addiu t9, zero, 20995 // FIX_2_562915447 - mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) - subu s0, s3, s0 // z3 += z5 - addu t0, t0, s0 // tmp0 += z3 - addu t1, t1, s0 // tmp2 += z3 - subu s2, s3, s2 // z4 += z5 - addu t2, t2, s2 // tmp1 += z4 - addu t3, t3, s2 // tmp3 += z4 - subu t0, t0, t8 // tmp0 += z1 - subu t1, t1, s1 // tmp2 += z2 - subu t2, t2, s1 // tmp1 += z2 - subu t3, t3, t8 // tmp3 += z1 - mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) - addiu t9, zero, 6270 // FIX_0_765366865 - mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) - lh t4, 0(a1) - lh t5, 0(a0) - lh t6, 64(a1) - lh t7, 64(a0) - mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) - mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) - addiu t9, zero, 4433 // FIX_0_541196100 - addu s3, s0, s1 // z2 + z3 - mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) - addiu t9, zero, 15137 // FIX_1_847759065 - mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) - addu t4, t5, t6 - subu t5, t5, t6 - sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS - sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS - addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) - subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) - addu s0, t4, t7 - subu s1, t4, t7 - addu s2, t5, t6 - subu s3, t5, t6 - addu t4, s0, t3 - subu s0, s0, t3 - addu t3, s2, t1 - subu s2, s2, t1 - addu t1, s3, t2 - subu s3, s3, t2 - addu t2, s1, t0 - subu s1, s1, t0 - shra_r.w t4, t4, 11 - shra_r.w t3, t3, 11 - shra_r.w t1, t1, 11 - shra_r.w t2, t2, 11 - shra_r.w s1, s1, 11 - shra_r.w s3, s3, 11 - shra_r.w s2, s2, 11 - shra_r.w s0, s0, 11 - sw t4, 0(v0) - sw t3, 32(v0) - sw t1, 64(v0) - sw t2, 96(v0) - sw s1, 128(v0) - sw s3, 160(v0) - sw s2, 192(v0) - sw s0, 224(v0) -3: - addiu a1, a1, 2 - addiu a0, a0, 2 - bgtz v1, 1b - addiu v0, v0, 4 - move v0, sp - addiu v1, zero, 8 -4: - lw t0, 8(v0) // z2 = (JLONG)wsptr[2] - lw t1, 24(v0) // z3 = (JLONG)wsptr[6] - lw t2, 0(v0) // (JLONG)wsptr[0] - lw t3, 16(v0) // (JLONG)wsptr[4] - lw s4, 4(v0) // (JLONG)wsptr[1] - lw s5, 12(v0) // (JLONG)wsptr[3] - lw s6, 20(v0) // (JLONG)wsptr[5] - lw s7, 28(v0) // (JLONG)wsptr[7] - or s4, s4, t0 - or s4, s4, t1 - or s4, s4, t3 - or s4, s4, s7 - or s4, s4, s5 - or s4, s4, s6 - bnez s4, 5f - addiu v1, v1, -1 - shra_r.w s5, t2, 5 - andi s5, s5, 0x3ff - lbux s5, s5(a3) - lw s1, 0(a2) - replv.qb s5, s5 - usw s5, 0(s1) - usw s5, 4(s1) - b 6f - nop -5: - addu t4, t0, t1 // z2 + z3 - addiu t8, zero, 4433 // FIX_0_541196100 - mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) - addiu t8, zero, 15137 // FIX_1_847759065 - mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) - addiu t8, zero, 6270 // FIX_0_765366865 - mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) - addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4] - subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4] - sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS - sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS - subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) - subu t3, t2, t1 // tmp12 = tmp1 - tmp2 - addu t2, t2, t1 // 
tmp11 = tmp1 + tmp2 - addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) - subu t1, t4, t5 // tmp13 = tmp0 - tmp3 - addu t0, t4, t5 // tmp10 = tmp0 + tmp3 - lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7] - lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3] - lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5] - lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1] - addu s0, t4, t6 // z3 = tmp0 + tmp2 - addiu t8, zero, 9633 // FIX_1_175875602 - addu s1, t5, t7 // z4 = tmp1 + tmp3 - addu s2, s0, s1 // z3 + z4 - mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) - addu s3, t4, t7 // z1 = tmp0 + tmp3 - addu t9, t5, t6 // z2 = tmp1 + tmp2 - addiu t8, zero, 16069 // FIX_1_961570560 - mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) - addiu t8, zero, 3196 // FIX_0_390180644 - mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) - addiu t8, zero, 2446 // FIX_0_298631336 - mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) - addiu t8, zero, 7373 // FIX_0_899976223 - mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) - addiu t8, zero, 16819 // FIX_2_053119869 - mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) - addiu t8, zero, 20995 // FIX_2_562915447 - mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) - addiu t8, zero, 25172 // FIX_3_072711026 - mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) - addiu t8, zero, 12299 // FIX_1_501321110 - mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) - subu s0, s2, s0 // z3 += z5 - subu s1, s2, s1 // z4 += z5 - addu t4, t4, s0 - subu t4, t4, s3 // tmp0 - addu t5, t5, s1 - subu t5, t5, t9 // tmp1 - addu t6, t6, s0 - subu t6, t6, t9 // tmp2 - addu t7, t7, s1 - subu t7, t7, s3 // tmp3 - addu s0, t0, t7 - subu t0, t0, t7 - addu t7, t2, t6 - subu t2, t2, t6 - addu t6, t3, t5 - subu t3, t3, t5 - addu t5, t1, t4 - subu t1, t1, t4 - shra_r.w s0, s0, 18 - shra_r.w t7, t7, 18 - shra_r.w t6, t6, 18 - shra_r.w t5, t5, 18 - shra_r.w t1, t1, 18 - shra_r.w t3, t3, 18 - shra_r.w t2, t2, 18 - shra_r.w t0, t0, 18 - andi s0, s0, 0x3ff - andi t7, t7, 0x3ff - andi t6, t6, 0x3ff - andi t5, t5, 0x3ff - andi t1, t1, 0x3ff - andi t3, t3, 0x3ff - andi t2, t2, 0x3ff - andi t0, t0, 0x3ff - lw s1, 0(a2) - lbux s0, s0(a3) - lbux t7, t7(a3) - lbux t6, t6(a3) - lbux t5, t5(a3) - lbux t1, t1(a3) - lbux t3, t3(a3) - lbux t2, t2(a3) - lbux t0, t0(a3) - sb s0, 0(s1) - sb t7, 1(s1) - sb t6, 2(s1) - sb t5, 3(s1) - sb t1, 4(s1) - sb t3, 5(s1) - sb t2, 6(s1) - sb t0, 7(s1) -6: - addiu v0, v0, 32 - bgtz v1, 4b - addiu a2, a2, 4 - addiu sp, sp, 256 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop - -END(jsimd_idct_islow_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) -/* - * a0 = inptr - * a1 = quantptr - * a2 = wsptr - * a3 = mips_idct_ifast_coefs - */ - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - addiu t9, a0, 16 // end address - or AT, a3, zero - -0: - lw s0, 0(a1) // quantptr[DCTSIZE*0] - lw t0, 0(a0) // inptr[DCTSIZE*0] - lw t1, 16(a0) // inptr[DCTSIZE*1] - muleq_s.w.phl v0, t0, s0 // tmp0 ... - lw t2, 32(a0) // inptr[DCTSIZE*2] - lw t3, 48(a0) // inptr[DCTSIZE*3] - lw t4, 64(a0) // inptr[DCTSIZE*4] - lw t5, 80(a0) // inptr[DCTSIZE*5] - muleq_s.w.phr t0, t0, s0 // ... tmp0 ... - lw t6, 96(a0) // inptr[DCTSIZE*6] - lw t7, 112(a0) // inptr[DCTSIZE*7] - or s4, t1, t2 - or s5, t3, t4 - bnez s4, 1f - ins t0, v0, 16, 16 // ... 
tmp0 - bnez s5, 1f - or s6, t5, t6 - or s6, s6, t7 - bnez s6, 1f - sw t0, 0(a2) // wsptr[DCTSIZE*0] - sw t0, 16(a2) // wsptr[DCTSIZE*1] - sw t0, 32(a2) // wsptr[DCTSIZE*2] - sw t0, 48(a2) // wsptr[DCTSIZE*3] - sw t0, 64(a2) // wsptr[DCTSIZE*4] - sw t0, 80(a2) // wsptr[DCTSIZE*5] - sw t0, 96(a2) // wsptr[DCTSIZE*6] - sw t0, 112(a2) // wsptr[DCTSIZE*7] - addiu a0, a0, 4 - b 2f - addiu a1, a1, 4 - -1: - lw s1, 32(a1) // quantptr[DCTSIZE*2] - lw s2, 64(a1) // quantptr[DCTSIZE*4] - muleq_s.w.phl v0, t2, s1 // tmp1 ... - muleq_s.w.phr t2, t2, s1 // ... tmp1 ... - lw s0, 16(a1) // quantptr[DCTSIZE*1] - lw s1, 48(a1) // quantptr[DCTSIZE*3] - lw s3, 96(a1) // quantptr[DCTSIZE*6] - muleq_s.w.phl v1, t4, s2 // tmp2 ... - muleq_s.w.phr t4, t4, s2 // ... tmp2 ... - lw s2, 80(a1) // quantptr[DCTSIZE*5] - lw t8, 4(AT) // FIX(1.414213562) - ins t2, v0, 16, 16 // ... tmp1 - muleq_s.w.phl v0, t6, s3 // tmp3 ... - muleq_s.w.phr t6, t6, s3 // ... tmp3 ... - ins t4, v1, 16, 16 // ... tmp2 - addq.ph s4, t0, t4 // tmp10 - subq.ph s5, t0, t4 // tmp11 - ins t6, v0, 16, 16 // ... tmp3 - subq.ph s6, t2, t6 // tmp12 ... - addq.ph s7, t2, t6 // tmp13 - mulq_s.ph s6, s6, t8 // ... tmp12 ... - addq.ph t0, s4, s7 // tmp0 - subq.ph t6, s4, s7 // tmp3 - muleq_s.w.phl v0, t1, s0 // tmp4 ... - muleq_s.w.phr t1, t1, s0 // ... tmp4 ... - shll_s.ph s6, s6, 1 // x2 - lw s3, 112(a1) // quantptr[DCTSIZE*7] - subq.ph s6, s6, s7 // ... tmp12 - muleq_s.w.phl v1, t7, s3 // tmp7 ... - muleq_s.w.phr t7, t7, s3 // ... tmp7 ... - ins t1, v0, 16, 16 // ... tmp4 - addq.ph t2, s5, s6 // tmp1 - subq.ph t4, s5, s6 // tmp2 - muleq_s.w.phl v0, t5, s2 // tmp6 ... - muleq_s.w.phr t5, t5, s2 // ... tmp6 ... - ins t7, v1, 16, 16 // ... tmp7 - addq.ph s5, t1, t7 // z11 - subq.ph s6, t1, t7 // z12 - muleq_s.w.phl v1, t3, s1 // tmp5 ... - muleq_s.w.phr t3, t3, s1 // ... tmp5 ... - ins t5, v0, 16, 16 // ... tmp6 - ins t3, v1, 16, 16 // ... tmp5 - addq.ph s7, t5, t3 // z13 - subq.ph v0, t5, t3 // z10 - addq.ph t7, s5, s7 // tmp7 - subq.ph s5, s5, s7 // tmp11 ... - addq.ph v1, v0, s6 // z5 ... - mulq_s.ph s5, s5, t8 // ... tmp11 - lw t8, 8(AT) // FIX(1.847759065) - lw s4, 0(AT) // FIX(1.082392200) - addq.ph s0, t0, t7 - subq.ph s1, t0, t7 - mulq_s.ph v1, v1, t8 // ... z5 - shll_s.ph s5, s5, 1 // x2 - lw t8, 12(AT) // FIX(-2.613125930) - sw s0, 0(a2) // wsptr[DCTSIZE*0] - shll_s.ph v0, v0, 1 // x4 - mulq_s.ph v0, v0, t8 // tmp12 ... - mulq_s.ph s4, s6, s4 // tmp10 ... - shll_s.ph v1, v1, 1 // x2 - addiu a0, a0, 4 - addiu a1, a1, 4 - sw s1, 112(a2) // wsptr[DCTSIZE*7] - shll_s.ph s6, v0, 1 // x4 - shll_s.ph s4, s4, 1 // x2 - addq.ph s6, s6, v1 // ... tmp12 - subq.ph t5, s6, t7 // tmp6 - subq.ph s4, s4, v1 // ... 
tmp10 - subq.ph t3, s5, t5 // tmp5 - addq.ph s2, t2, t5 - addq.ph t1, s4, t3 // tmp4 - subq.ph s3, t2, t5 - sw s2, 16(a2) // wsptr[DCTSIZE*1] - sw s3, 96(a2) // wsptr[DCTSIZE*6] - addq.ph v0, t4, t3 - subq.ph v1, t4, t3 - sw v0, 32(a2) // wsptr[DCTSIZE*2] - sw v1, 80(a2) // wsptr[DCTSIZE*5] - addq.ph v0, t6, t1 - subq.ph v1, t6, t1 - sw v0, 64(a2) // wsptr[DCTSIZE*4] - sw v1, 48(a2) // wsptr[DCTSIZE*3] - -2: - bne a0, t9, 0b - addiu a2, a2, 4 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop - -END(jsimd_idct_ifast_cols_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) -/* - * a0 = wsptr - * a1 = output_buf - * a2 = output_col - * a3 = mips_idct_ifast_coefs - */ - SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 - - addiu t9, a0, 128 // end address - lui s8, 0x8080 - ori s8, s8, 0x8080 - -0: - lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) - lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a - lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A - lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c - lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C - lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e - lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E - lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g - lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G - precrq.ph.w t1, s0, t0 // B b - ins t0, s0, 16, 16 // A a - bnez t1, 1f - or s0, t2, s2 - bnez s0, 1f - or s0, t4, s4 - bnez s0, 1f - or s0, t6, s6 - bnez s0, 1f - shll_s.ph s0, t0, 2 // A a - lw a3, 0(a1) - lw AT, 4(a1) - precrq.ph.w t0, s0, s0 // A A - ins s0, s0, 16, 16 // a a - addu a3, a3, a2 - addu AT, AT, a2 - precrq.qb.ph t0, t0, t0 // A A A A - precrq.qb.ph s0, s0, s0 // a a a a - addu.qb s0, s0, s8 - addu.qb t0, t0, s8 - sw s0, 0(a3) - sw s0, 4(a3) - sw t0, 0(AT) - sw t0, 4(AT) - addiu a0, a0, 32 - bne a0, t9, 0b - addiu a1, a1, 8 - b 2f - nop - -1: - precrq.ph.w t3, s2, t2 - ins t2, s2, 16, 16 - precrq.ph.w t5, s4, t4 - ins t4, s4, 16, 16 - precrq.ph.w t7, s6, t6 - ins t6, s6, 16, 16 - lw t8, 4(AT) // FIX(1.414213562) - addq.ph s4, t0, t4 // tmp10 - subq.ph s5, t0, t4 // tmp11 - subq.ph s6, t2, t6 // tmp12 ... - addq.ph s7, t2, t6 // tmp13 - mulq_s.ph s6, s6, t8 // ... tmp12 ... - addq.ph t0, s4, s7 // tmp0 - subq.ph t6, s4, s7 // tmp3 - shll_s.ph s6, s6, 1 // x2 - subq.ph s6, s6, s7 // ... tmp12 - addq.ph t2, s5, s6 // tmp1 - subq.ph t4, s5, s6 // tmp2 - addq.ph s5, t1, t7 // z11 - subq.ph s6, t1, t7 // z12 - addq.ph s7, t5, t3 // z13 - subq.ph v0, t5, t3 // z10 - addq.ph t7, s5, s7 // tmp7 - subq.ph s5, s5, s7 // tmp11 ... - addq.ph v1, v0, s6 // z5 ... - mulq_s.ph s5, s5, t8 // ... tmp11 - lw t8, 8(AT) // FIX(1.847759065) - lw s4, 0(AT) // FIX(1.082392200) - addq.ph s0, t0, t7 // tmp0 + tmp7 - subq.ph s7, t0, t7 // tmp0 - tmp7 - mulq_s.ph v1, v1, t8 // ... z5 - lw a3, 0(a1) - lw t8, 12(AT) // FIX(-2.613125930) - shll_s.ph s5, s5, 1 // x2 - addu a3, a3, a2 - shll_s.ph v0, v0, 1 // x4 - mulq_s.ph v0, v0, t8 // tmp12 ... - mulq_s.ph s4, s6, s4 // tmp10 ... - shll_s.ph v1, v1, 1 // x2 - addiu a0, a0, 32 - addiu a1, a1, 8 - shll_s.ph s6, v0, 1 // x4 - shll_s.ph s4, s4, 1 // x2 - addq.ph s6, s6, v1 // ... tmp12 - shll_s.ph s0, s0, 2 - subq.ph t5, s6, t7 // tmp6 - subq.ph s4, s4, v1 // ... 
tmp10 - subq.ph t3, s5, t5 // tmp5 - shll_s.ph s7, s7, 2 - addq.ph t1, s4, t3 // tmp4 - addq.ph s1, t2, t5 // tmp1 + tmp6 - subq.ph s6, t2, t5 // tmp1 - tmp6 - addq.ph s2, t4, t3 // tmp2 + tmp5 - subq.ph s5, t4, t3 // tmp2 - tmp5 - addq.ph s4, t6, t1 // tmp3 + tmp4 - subq.ph s3, t6, t1 // tmp3 - tmp4 - shll_s.ph s1, s1, 2 - shll_s.ph s2, s2, 2 - shll_s.ph s3, s3, 2 - shll_s.ph s4, s4, 2 - shll_s.ph s5, s5, 2 - shll_s.ph s6, s6, 2 - precrq.ph.w t0, s1, s0 // B A - ins s0, s1, 16, 16 // b a - precrq.ph.w t2, s3, s2 // D C - ins s2, s3, 16, 16 // d c - precrq.ph.w t4, s5, s4 // F E - ins s4, s5, 16, 16 // f e - precrq.ph.w t6, s7, s6 // H G - ins s6, s7, 16, 16 // h g - precrq.qb.ph t0, t2, t0 // D C B A - precrq.qb.ph s0, s2, s0 // d c b a - precrq.qb.ph t4, t6, t4 // H G F E - precrq.qb.ph s4, s6, s4 // h g f e - addu.qb s0, s0, s8 - addu.qb s4, s4, s8 - sw s0, 0(a3) // outptr[0/1/2/3] d c b a - sw s4, 4(a3) // outptr[4/5/6/7] h g f e - lw a3, -4(a1) - addu.qb t0, t0, s8 - addu a3, a3, a2 - addu.qb t4, t4, s8 - sw t0, 0(a3) // outptr[0/1/2/3] D C B A - bne a0, t9, 0b - sw t4, 4(a3) // outptr[4/5/6/7] H G F E - -2: - - RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 - - j ra - nop - -END(jsimd_idct_ifast_rows_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_fdct_islow_dspr2) -/* - * a0 = data - */ - SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 - - lui t0, 6437 - ori t0, 2260 - lui t1, 9633 - ori t1, 11363 - lui t2, 0xd39e - ori t2, 0xe6dc - lui t3, 0xf72d - ori t3, 9633 - lui t4, 2261 - ori t4, 9633 - lui t5, 0xd39e - ori t5, 6437 - lui t6, 9633 - ori t6, 0xd39d - lui t7, 0xe6dc - ori t7, 2260 - lui t8, 4433 - ori t8, 10703 - lui t9, 0xd630 - ori t9, 4433 - li s8, 8 - move a1, a0 -1: - lw s0, 0(a1) // tmp0 = 1|0 - lw s1, 4(a1) // tmp1 = 3|2 - lw s2, 8(a1) // tmp2 = 5|4 - lw s3, 12(a1) // tmp3 = 7|6 - packrl.ph s1, s1, s1 // tmp1 = 2|3 - packrl.ph s3, s3, s3 // tmp3 = 6|7 - subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 - subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 - dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 - dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 - mult $ac2, $0, $0 // ac2 = 0 - dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 - dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 - mult $ac3, $0, $0 // ac3 = 0 - dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 - dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 - addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 - addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 - extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 - extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 - extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 - extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 - addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 - subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 - sh s0, 2(a1) - sh s1, 6(a1) - sh s2, 10(a1) - sh s3, 14(a1) - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 - sra s4, s5, 16 // tmp4 = t11 - addiu a1, a1, 16 - addiu s8, s8, -1 - extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 - extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 - addu s2, s5, s4 // tmp2 = t10 + t11 - subu s3, s5, s4 // tmp3 = t10 - t11 - sll s2, s2, 
2 // tmp2 = (t10 + t11) << 2 - sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 - sh s2, -16(a1) - sh s3, -8(a1) - sh s0, -12(a1) - bgtz s8, 1b - sh s1, -4(a1) - li t0, 2260 - li t1, 11363 - li t2, 9633 - li t3, 6436 - li t4, 6437 - li t5, 2261 - li t6, 11362 - li t7, 2259 - li t8, 4433 - li t9, 10703 - li a1, 10704 - li s8, 8 - -2: - lh a2, 0(a0) // 0 - lh a3, 16(a0) // 8 - lh v0, 32(a0) // 16 - lh v1, 48(a0) // 24 - lh s4, 64(a0) // 32 - lh s5, 80(a0) // 40 - lh s6, 96(a0) // 48 - lh s7, 112(a0) // 56 - addu s2, v0, s5 // tmp2 = 16 + 40 - subu s5, v0, s5 // tmp5 = 16 - 40 - addu s3, v1, s4 // tmp3 = 24 + 32 - subu s4, v1, s4 // tmp4 = 24 - 32 - addu s0, a2, s7 // tmp0 = 0 + 56 - subu s7, a2, s7 // tmp7 = 0 - 56 - addu s1, a3, s6 // tmp1 = 8 + 48 - subu s6, a3, s6 // tmp6 = 8 - 48 - addu a2, s0, s3 // tmp10 = tmp0 + tmp3 - subu v1, s0, s3 // tmp13 = tmp0 - tmp3 - addu a3, s1, s2 // tmp11 = tmp1 + tmp2 - subu v0, s1, s2 // tmp12 = tmp1 - tmp2 - mult s7, t1 // ac0 = tmp7 * c1 - madd s4, t0 // ac0 += tmp4 * c0 - madd s5, t4 // ac0 += tmp5 * c4 - madd s6, t2 // ac0 += tmp6 * c2 - mult $ac1, s7, t2 // ac1 = tmp7 * c2 - msub $ac1, s4, t3 // ac1 -= tmp4 * c3 - msub $ac1, s5, t6 // ac1 -= tmp5 * c6 - msub $ac1, s6, t7 // ac1 -= tmp6 * c7 - mult $ac2, s7, t4 // ac2 = tmp7 * c4 - madd $ac2, s4, t2 // ac2 += tmp4 * c2 - madd $ac2, s5, t5 // ac2 += tmp5 * c5 - msub $ac2, s6, t6 // ac2 -= tmp6 * c6 - mult $ac3, s7, t0 // ac3 = tmp7 * c0 - msub $ac3, s4, t1 // ac3 -= tmp4 * c1 - madd $ac3, s5, t2 // ac3 += tmp5 * c2 - msub $ac3, s6, t3 // ac3 -= tmp6 * c3 - extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 - extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 - extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 - extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 - addiu s8, s8, -1 - addu s4, a2, a3 // tmp4 = tmp10 + tmp11 - subu s5, a2, a3 // tmp5 = tmp10 - tmp11 - sh s0, 16(a0) - sh s1, 48(a0) - sh s2, 80(a0) - sh s3, 112(a0) - mult v0, t8 // ac0 = tmp12 * c8 - madd v1, t9 // ac0 += tmp13 * c9 - mult $ac1, v1, t8 // ac1 = tmp13 * c8 - msub $ac1, v0, a1 // ac1 -= tmp12 * c10 - addiu a0, a0, 2 - extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 - extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 - shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 - shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 - sh s4, -2(a0) - sh s5, 62(a0) - sh s6, 30(a0) - bgtz s8, 2b - sh s7, 94(a0) - - RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 - - jr ra - nop - -END(jsimd_fdct_islow_dspr2) - - -/**************************************************************************/ -LEAF_DSPR2(jsimd_fdct_ifast_dspr2) -/* - * a0 = data - */ - .set at - - SAVE_REGS_ON_STACK 8, s0, s1 - - li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) - li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) - li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) - li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) - - move v0, a0 - addiu v1, v0, 128 // end address - -0: - lw t0, 0(v0) // tmp0 = 1|0 - lw t1, 4(v0) // tmp1 = 3|2 - lw t2, 8(v0) // tmp2 = 5|4 - lw t3, 12(v0) // tmp3 = 7|6 - packrl.ph t1, t1, t1 // tmp1 = 2|3 - packrl.ph t3, t3, t3 // tmp3 = 6|7 - subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 - subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 - addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 - addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 - addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 - subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 - sra t4, t8, 16 // tmp4 = t11 - mult $0, $0 // ac0 = 0 
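- // The dpa.w.ph below is a packed dot product: it multiplies the two
- // halfwords of t9 (t12|t13) by the matching halves of s1 (181|181) and
- // accumulates into $ac0, i.e. ac0 = (t12 + t13) * 181. The extr.w ..., 8
- // further down divides by 256, so this matches the scalar ifast FDCT's
- // MULTIPLY(tmp12 + tmp13, FIX_0_707106781), with 181/256 ~= 0.707.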
- dpa.w.ph $ac0, t9, s1 - mult $ac1, $0, $0 // ac1 = 0 - dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 - dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98 - mult $ac2, $0, $0 // ac2 = 0 - dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 - mult $ac3, $0, $0 // ac3 = 0 - dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 - precrq.ph.w t0, t5, t7 // t0 = t5|t6 - addq.ph t2, t8, t4 // tmp2 = t10 + t11 - subq.ph t3, t8, t4 // tmp3 = t10 - t11 - extr.w t4, $ac0, 8 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 - extr.w t0, $ac1, 8 // t0 = z5 - extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) - extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334) - extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) - add t6, t1, t0 // t6 = z2 - add t7, t7, t0 // t7 = z4 - subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 - addq.ph t8, t5, t8 // t9 = z11 = tmp7 + z3 - addq.ph t1, t0, t6 // t1 = z13 + z2 - subq.ph t6, t0, t6 // t6 = z13 - z2 - addq.ph t0, t8, t7 // t0 = z11 + z4 - subq.ph t7, t8, t7 // t7 = z11 - z4 - addq.ph t5, t4, t9 - subq.ph t4, t9, t4 - sh t2, 0(v0) - sh t5, 4(v0) - sh t3, 8(v0) - sh t4, 12(v0) - sh t1, 10(v0) - sh t6, 6(v0) - sh t0, 2(v0) - sh t7, 14(v0) - addiu v0, 16 - bne v1, v0, 0b - nop - move v0, a0 - addiu v1, v0, 16 - -1: - lh t0, 0(v0) // 0 - lh t1, 16(v0) // 8 - lh t2, 32(v0) // 16 - lh t3, 48(v0) // 24 - lh t4, 64(v0) // 32 - lh t5, 80(v0) // 40 - lh t6, 96(v0) // 48 - lh t7, 112(v0) // 56 - add t8, t0, t7 // t8 = tmp0 - sub t7, t0, t7 // t7 = tmp7 - add t0, t1, t6 // t0 = tmp1 - sub t1, t1, t6 // t1 = tmp6 - add t6, t2, t5 // t6 = tmp2 - sub t5, t2, t5 // t5 = tmp5 - add t2, t3, t4 // t2 = tmp3 - sub t3, t3, t4 // t3 = tmp4 - add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 - sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 - sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 - ins t8, s0, 16, 16 // t8 = tmp12|tmp13 - add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 - mult $0, $0 // ac0 = 0 - dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 - add s0, t4, t2 // t8 = tmp10+tmp11 - sub t4, t4, t2 // t4 = tmp10-tmp11 - sh s0, 0(v0) - sh t4, 64(v0) - extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781) - addq.ph t4, t8, t2 // t9 = tmp13 + z1 - subq.ph t8, t8, t2 // t2 = tmp13 - z1 - sh t4, 32(v0) - sh t8, 96(v0) - add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5 - add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6 - add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7 - andi t4, a1, 0xffff - mul s0, t1, t4 - sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965) - ins t1, t3, 16, 16 // t1 = tmp10|tmp12 - mult $0, $0 // ac0 = 0 - mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98 - extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433) - add t2, t7, t8 // t2 = tmp7 + z5 - sub t7, t7, t8 // t7 = tmp7 - z5 - andi t4, a2, 0xffff - mul t8, t3, t4 - sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100) - andi t4, s1, 0xffff - mul t6, t0, t4 - sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781) - add t0, t6, t8 // t0 = z3 + z2 - sub t1, t6, t8 // t1 = z3 - z2 - add t3, t6, s0 // t3 = z3 + z4 - sub t4, t6, s0 // t4 = z3 - z4 - sub t5, t2, t1 // t5 = dataptr[5] - sub t6, t7, t0 // t6 = dataptr[3] - add t3, t2, t3 // t3 = dataptr[1] - add t4, t7, t4 // t4 = dataptr[7] - sh t5, 80(v0) - sh t6, 48(v0) - sh t3, 16(v0) - sh t4, 112(v0) - addiu v0, 2 - bne v0, v1, 1b - nop - - RESTORE_REGS_FROM_STACK 8, s0, s1 - - j ra - nop -END(jsimd_fdct_ifast_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_quantize_dspr2) -/* - * a0 = coef_block - 
* a1 = divisors - * a2 = workspace - */ - .set at - - SAVE_REGS_ON_STACK 16, s0, s1, s2 - - addiu v0, a2, 124 // v0 = workspace_end - lh t0, 0(a2) - lh t1, 0(a1) - lh t2, 128(a1) - sra t3, t0, 15 - sll t3, t3, 1 - addiu t3, t3, 1 - mul t0, t0, t3 - lh t4, 384(a1) - lh t5, 130(a1) - lh t6, 2(a2) - lh t7, 2(a1) - lh t8, 386(a1) - -1: - andi t1, 0xffff - add t9, t0, t2 - andi t9, 0xffff - mul v1, t9, t1 - sra s0, t6, 15 - sll s0, s0, 1 - addiu s0, s0, 1 - addiu t9, t4, 16 - srav v1, v1, t9 - mul v1, v1, t3 - mul t6, t6, s0 - andi t7, 0xffff - addiu a2, a2, 4 - addiu a1, a1, 4 - add s1, t6, t5 - andi s1, 0xffff - sh v1, 0(a0) - - mul s2, s1, t7 - addiu s1, t8, 16 - srav s2, s2, s1 - mul s2, s2, s0 - lh t0, 0(a2) - lh t1, 0(a1) - sra t3, t0, 15 - sll t3, t3, 1 - addiu t3, t3, 1 - mul t0, t0, t3 - lh t2, 128(a1) - lh t4, 384(a1) - lh t5, 130(a1) - lh t8, 386(a1) - lh t6, 2(a2) - lh t7, 2(a1) - sh s2, 2(a0) - lh t0, 0(a2) - sra t3, t0, 15 - sll t3, t3, 1 - addiu t3, t3, 1 - mul t0, t0, t3 - bne a2, v0, 1b - addiu a0, a0, 4 - - andi t1, 0xffff - add t9, t0, t2 - andi t9, 0xffff - mul v1, t9, t1 - sra s0, t6, 15 - sll s0, s0, 1 - addiu s0, s0, 1 - addiu t9, t4, 16 - srav v1, v1, t9 - mul v1, v1, t3 - mul t6, t6, s0 - andi t7, 0xffff - sh v1, 0(a0) - add s1, t6, t5 - andi s1, 0xffff - mul s2, s1, t7 - addiu s1, t8, 16 - addiu a2, a2, 4 - addiu a1, a1, 4 - srav s2, s2, s1 - mul s2, s2, s0 - sh s2, 2(a0) - - RESTORE_REGS_FROM_STACK 16, s0, s1, s2 - - j ra - nop - -END(jsimd_quantize_dspr2) - - -#ifndef __mips_soft_float - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_quantize_float_dspr2) -/* - * a0 = coef_block - * a1 = divisors - * a2 = workspace - */ - .set at - - li t1, 0x46800100 // integer representation 16384.5 - mtc1 t1, f0 - li t0, 63 -0: - lwc1 f2, 0(a2) - lwc1 f10, 0(a1) - lwc1 f4, 4(a2) - lwc1 f12, 4(a1) - lwc1 f6, 8(a2) - lwc1 f14, 8(a1) - lwc1 f8, 12(a2) - lwc1 f16, 12(a1) - madd.s f2, f0, f2, f10 - madd.s f4, f0, f4, f12 - madd.s f6, f0, f6, f14 - madd.s f8, f0, f8, f16 - lwc1 f10, 16(a1) - lwc1 f12, 20(a1) - trunc.w.s f2, f2 - trunc.w.s f4, f4 - trunc.w.s f6, f6 - trunc.w.s f8, f8 - lwc1 f14, 24(a1) - lwc1 f16, 28(a1) - mfc1 t1, f2 - mfc1 t2, f4 - mfc1 t3, f6 - mfc1 t4, f8 - lwc1 f2, 16(a2) - lwc1 f4, 20(a2) - lwc1 f6, 24(a2) - lwc1 f8, 28(a2) - madd.s f2, f0, f2, f10 - madd.s f4, f0, f4, f12 - madd.s f6, f0, f6, f14 - madd.s f8, f0, f8, f16 - addiu t1, t1, -16384 - addiu t2, t2, -16384 - addiu t3, t3, -16384 - addiu t4, t4, -16384 - trunc.w.s f2, f2 - trunc.w.s f4, f4 - trunc.w.s f6, f6 - trunc.w.s f8, f8 - sh t1, 0(a0) - sh t2, 2(a0) - sh t3, 4(a0) - sh t4, 6(a0) - mfc1 t1, f2 - mfc1 t2, f4 - mfc1 t3, f6 - mfc1 t4, f8 - addiu t0, t0, -8 - addiu a2, a2, 32 - addiu a1, a1, 32 - addiu t1, t1, -16384 - addiu t2, t2, -16384 - addiu t3, t3, -16384 - addiu t4, t4, -16384 - sh t1, 8(a0) - sh t2, 10(a0) - sh t3, 12(a0) - sh t4, 14(a0) - bgez t0, 0b - addiu a0, a0, 16 - - j ra - nop - -END(jsimd_quantize_float_dspr2) - -#endif - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_2x2_dspr2) -/* - * a0 = compptr->dct_table - * a1 = coef_block - * a2 = output_buf - * a3 = output_col - */ - .set at - - SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 - - addiu sp, sp, -40 - move v0, sp - addiu s2, zero, 29692 - addiu s3, zero, -10426 - addiu s4, zero, 6967 - addiu s5, zero, -5906 - lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] - lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] - lh t1, 48(a1) // t1 = 
inptr[DCTSIZE*3] - lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] - mul t4, t5, t0 - lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] - lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] - mul t6, t6, t1 - mul t5, t5, t0 - lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] - lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] - lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] - lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] - mul t7, t7, t2 - mult zero, zero - mul t8, t8, t3 - li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) - li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) - ins t6, t5, 16, 16 // t6 = t5|t6 - sll t4, t4, 15 - dpa.w.ph $ac0, t6, s0 - lh t1, 2(a1) - lh t6, 2(a0) - ins t8, t7, 16, 16 // t8 = t7|t8 - dpa.w.ph $ac0, t8, s1 - mflo t0, $ac0 - mul t5, t6, t1 - lh t1, 18(a1) - lh t6, 18(a0) - lh t2, 50(a1) - lh t7, 50(a0) - mul t6, t6, t1 - subu t8, t4, t0 - mul t7, t7, t2 - addu t0, t4, t0 - shra_r.w t0, t0, 13 - lh t1, 82(a1) - lh t2, 82(a0) - lh t3, 114(a1) - lh t4, 114(a0) - shra_r.w t8, t8, 13 - mul t1, t1, t2 - mul t3, t3, t4 - sw t0, 0(v0) - sw t8, 20(v0) - sll t4, t5, 15 - ins t7, t6, 16, 16 - mult zero, zero - dpa.w.ph $ac0, t7, s0 - ins t3, t1, 16, 16 - lh t1, 6(a1) - lh t6, 6(a0) - dpa.w.ph $ac0, t3, s1 - mflo t0, $ac0 - mul t5, t6, t1 - lh t1, 22(a1) - lh t6, 22(a0) - lh t2, 54(a1) - lh t7, 54(a0) - mul t6, t6, t1 - subu t8, t4, t0 - mul t7, t7, t2 - addu t0, t4, t0 - shra_r.w t0, t0, 13 - lh t1, 86(a1) - lh t2, 86(a0) - lh t3, 118(a1) - lh t4, 118(a0) - shra_r.w t8, t8, 13 - mul t1, t1, t2 - mul t3, t3, t4 - sw t0, 4(v0) - sw t8, 24(v0) - sll t4, t5, 15 - ins t7, t6, 16, 16 - mult zero, zero - dpa.w.ph $ac0, t7, s0 - ins t3, t1, 16, 16 - lh t1, 10(a1) - lh t6, 10(a0) - dpa.w.ph $ac0, t3, s1 - mflo t0, $ac0 - mul t5, t6, t1 - lh t1, 26(a1) - lh t6, 26(a0) - lh t2, 58(a1) - lh t7, 58(a0) - mul t6, t6, t1 - subu t8, t4, t0 - mul t7, t7, t2 - addu t0, t4, t0 - shra_r.w t0, t0, 13 - lh t1, 90(a1) - lh t2, 90(a0) - lh t3, 122(a1) - lh t4, 122(a0) - shra_r.w t8, t8, 13 - mul t1, t1, t2 - mul t3, t3, t4 - sw t0, 8(v0) - sw t8, 28(v0) - sll t4, t5, 15 - ins t7, t6, 16, 16 - mult zero, zero - dpa.w.ph $ac0, t7, s0 - ins t3, t1, 16, 16 - lh t1, 14(a1) - lh t6, 14(a0) - dpa.w.ph $ac0, t3, s1 - mflo t0, $ac0 - mul t5, t6, t1 - lh t1, 30(a1) - lh t6, 30(a0) - lh t2, 62(a1) - lh t7, 62(a0) - mul t6, t6, t1 - subu t8, t4, t0 - mul t7, t7, t2 - addu t0, t4, t0 - shra_r.w t0, t0, 13 - lh t1, 94(a1) - lh t2, 94(a0) - lh t3, 126(a1) - lh t4, 126(a0) - shra_r.w t8, t8, 13 - mul t1, t1, t2 - mul t3, t3, t4 - sw t0, 12(v0) - sw t8, 32(v0) - sll t4, t5, 15 - ins t7, t6, 16, 16 - mult zero, zero - dpa.w.ph $ac0, t7, s0 - ins t3, t1, 16, 16 - dpa.w.ph $ac0, t3, s1 - mflo t0, $ac0 - lw t9, 0(a2) - lw t3, 0(v0) - lw t7, 4(v0) - lw t1, 8(v0) - addu t9, t9, a3 - sll t3, t3, 15 - subu t8, t4, t0 - addu t0, t4, t0 - shra_r.w t0, t0, 13 - shra_r.w t8, t8, 13 - sw t0, 16(v0) - sw t8, 36(v0) - lw t5, 12(v0) - lw t6, 16(v0) - mult t7, s2 - madd t1, s3 - madd t5, s4 - madd t6, s5 - lw t5, 24(v0) - lw t7, 28(v0) - mflo t0, $ac0 - lw t8, 32(v0) - lw t2, 36(v0) - mult $ac1, t5, s2 - madd $ac1, t7, s3 - madd $ac1, t8, s4 - madd $ac1, t2, s5 - addu t1, t3, t0 - subu t6, t3, t0 - shra_r.w t1, t1, 20 - shra_r.w t6, t6, 20 - mflo t4, $ac1 - shll_s.w t1, t1, 24 - shll_s.w t6, t6, 24 - sra t1, t1, 24 - sra t6, t6, 24 - addiu t1, t1, 128 - addiu t6, t6, 128 - lw t0, 20(v0) - sb t1, 0(t9) - sb t6, 1(t9) - sll t0, t0, 15 - lw t9, 4(a2) - addu t1, t0, t4 - subu t6, t0, t4 - addu t9, t9, a3 - shra_r.w t1, t1, 20 - shra_r.w t6, t6, 20 - shll_s.w t1, t1, 24 
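- // Range-limit + level-shift idiom: shll_s.w saturates while shifting
- // left by 24, so the subsequent sra by 24 leaves the value clamped to
- // the signed byte range [-128, 127], and the addiu of 128 recenters it
- // to the unsigned sample range [0, 255] before the byte store.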
- shll_s.w t6, t6, 24 - sra t1, t1, 24 - sra t6, t6, 24 - addiu t1, t1, 128 - addiu t6, t6, 128 - sb t1, 0(t9) - sb t6, 1(t9) - addiu sp, sp, 40 - - RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 - - j ra - nop - -END(jsimd_idct_2x2_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_4x4_dspr2) -/* - * a0 = compptr->dct_table - * a1 = coef_block - * a2 = output_buf - * a3 = output_col - * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes - */ - .set at - - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - lw v1, 48(sp) - move t0, a1 - move t1, v1 - li t9, 4 - li s0, 0x2e75f93e - li s1, 0x21f9ba79 - li s2, 0xecc2efb0 - li s3, 0x52031ccd - -0: - lh s6, 32(t0) // inptr[DCTSIZE*2] - lh t6, 32(a0) // quantptr[DCTSIZE*2] - lh s7, 96(t0) // inptr[DCTSIZE*6] - lh t7, 96(a0) // quantptr[DCTSIZE*6] - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh s4, 0(t0) // inptr[DCTSIZE*0] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s5, 0(a0) // quantptr[0] - li s6, 15137 - li s7, 6270 - mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh t5, 112(t0) // inptr[DCTSIZE*7] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s4, 112(a0) // quantptr[DCTSIZE*7] - lh v0, 80(t0) // inptr[DCTSIZE*5] - lh s5, 80(a0) // quantptr[DCTSIZE*5] - lh s6, 48(a0) // quantptr[DCTSIZE*3] - sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) - lh s7, 16(a0) // quantptr[DCTSIZE*1] - lh t8, 16(t0) // inptr[DCTSIZE*1] - subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) - lh t7, 48(t0) // inptr[DCTSIZE*3] - mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) - mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) - mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) - mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) - addu t3, t2, t6 // tmp10 = tmp0 + z2 - subu t4, t2, t6 // tmp10 = tmp0 - z2 - mult $ac0, zero, zero - mult $ac1, zero, zero - ins t5, v0, 16, 16 - ins t7, t8, 16, 16 - addiu t9, t9, -1 - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - mflo s4, $ac0 - mflo s5, $ac1 - addiu a0, a0, 2 - addiu t1, t1, 4 - addiu t0, t0, 2 - addu t6, t4, s4 - subu t5, t4, s4 - addu s6, t3, s5 - subu s7, t3, s5 - shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) - shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) - shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) - shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) - sw t6, 28(t1) - sw t5, 60(t1) - sw s6, -4(t1) - bgtz t9, 0b - sw s7, 92(t1) - // second loop three pass - li t9, 3 -1: - lh s6, 34(t0) // inptr[DCTSIZE*2] - lh t6, 34(a0) // quantptr[DCTSIZE*2] - lh s7, 98(t0) // inptr[DCTSIZE*6] - lh t7, 98(a0) // quantptr[DCTSIZE*6] - mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh s4, 2(t0) // inptr[DCTSIZE*0] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s5, 2(a0) // quantptr[DCTSIZE*0] - li s6, 15137 - li s7, 6270 - mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) - mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) - lh t5, 114(t0) // inptr[DCTSIZE*7] - mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) - lh s4, 114(a0) // quantptr[DCTSIZE*7] - lh s5, 82(a0) // quantptr[DCTSIZE*5] - lh t6, 82(t0) // inptr[DCTSIZE*5] - sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) - lh s6, 50(a0) // quantptr[DCTSIZE*3] - lh t8, 
18(t0) // inptr[DCTSIZE*1] - subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) - lh t7, 50(t0) // inptr[DCTSIZE*3] - lh s7, 18(a0) // quantptr[DCTSIZE*1] - mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) - mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) - mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) - mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) - addu t3, t2, v0 // tmp10 = tmp0 + z2 - subu t4, t2, v0 // tmp10 = tmp0 - z2 - mult $ac0, zero, zero - mult $ac1, zero, zero - ins t5, t6, 16, 16 - ins t7, t8, 16, 16 - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - mflo t5, $ac0 - mflo t6, $ac1 - addiu t9, t9, -1 - addiu t0, t0, 2 - addiu a0, a0, 2 - addiu t1, t1, 4 - addu s5, t4, t5 - subu s4, t4, t5 - addu s6, t3, t6 - subu s7, t3, t6 - shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) - shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) - shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) - shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) - sw s5, 32(t1) - sw s4, 64(t1) - sw s6, 0(t1) - bgtz t9, 1b - sw s7, 96(t1) - move t1, v1 - li s4, 15137 - lw s6, 8(t1) // wsptr[2] - li s5, 6270 - lw s7, 24(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 0(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 28(t1) // wsptr[7] - lh t6, 20(t1) // wsptr[5] - lh t7, 12(t1) // wsptr[3] - lh t8, 4(t1) // wsptr[1] - ins t5, t6, 16, 16 - ins t7, t8, 16, 16 - mult $ac0, zero, zero - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - mult $ac1, zero, zero - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) - mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) - subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 - mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 - addu t7, t4, s6 - subu t8, t4, s6 - addu t5, t3, s7 - subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) - sll s4, t9, 2 - lw v0, 0(a2) // output_buf[ctr] - shll_s.w t5, t5, 24 - shll_s.w t6, t6, 24 - shll_s.w t7, t7, 24 - shll_s.w t8, t8, 24 - sra t5, t5, 24 - sra t6, t6, 24 - sra t7, t7, 24 - sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col - addiu t5, t5, 128 - addiu t6, t6, 128 - addiu t7, t7, 128 - addiu t8, t8, 128 - sb t5, 0(v0) - sb t7, 1(v0) - sb t8, 2(v0) - sb t6, 3(v0) - // 2 - li s4, 15137 - lw s6, 40(t1) // wsptr[2] - li s5, 6270 - lw s7, 56(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 32(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 60(t1) // wsptr[7] - lh t6, 52(t1) // wsptr[5] - lh t7, 44(t1) // wsptr[3] - lh t8, 36(t1) // wsptr[1] - ins t5, t6, 16, 16 - ins t7, t8, 16, 16 - mult $ac0, zero, zero - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - mult $ac1, zero, zero - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) - mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) - subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 - mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 - addu t7, t4, s6 - subu t8, t4, s6 - addu t5, t3, s7 - subu t6, t3, s7 - 
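- // shra_r.w performs DESCALE() in a single instruction: it is an
- // arithmetic right shift with rounding, adding 1 << (19 - 1) before
- // shifting, i.e. (x + 262144) >> 19.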
shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) - sll s4, t9, 2 - lw v0, 4(a2) // output_buf[ctr] - shll_s.w t5, t5, 24 - shll_s.w t6, t6, 24 - shll_s.w t7, t7, 24 - shll_s.w t8, t8, 24 - sra t5, t5, 24 - sra t6, t6, 24 - sra t7, t7, 24 - sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col - addiu t5, t5, 128 - addiu t6, t6, 128 - addiu t7, t7, 128 - addiu t8, t8, 128 - sb t5, 0(v0) - sb t7, 1(v0) - sb t8, 2(v0) - sb t6, 3(v0) - // 3 - li s4, 15137 - lw s6, 72(t1) // wsptr[2] - li s5, 6270 - lw s7, 88(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 64(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 92(t1) // wsptr[7] - lh t6, 84(t1) // wsptr[5] - lh t7, 76(t1) // wsptr[3] - lh t8, 68(t1) // wsptr[1] - ins t5, t6, 16, 16 - ins t7, t8, 16, 16 - mult $ac0, zero, zero - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - mult $ac1, zero, zero - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) - mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) - subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2 - mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2 - addu t7, t4, s6 - subu t8, t4, s6 - addu t5, t3, s7 - subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) - sll s4, t9, 2 - lw v0, 8(a2) // output_buf[ctr] - shll_s.w t5, t5, 24 - shll_s.w t6, t6, 24 - shll_s.w t7, t7, 24 - shll_s.w t8, t8, 24 - sra t5, t5, 24 - sra t6, t6, 24 - sra t7, t7, 24 - sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + output_col - addiu t5, t5, 128 - addiu t6, t6, 128 - addiu t7, t7, 128 - addiu t8, t8, 128 - sb t5, 0(v0) - sb t7, 1(v0) - sb t8, 2(v0) - sb t6, 3(v0) - li s4, 15137 - lw s6, 104(t1) // wsptr[2] - li s5, 6270 - lw s7, 120(t1) // wsptr[6] - mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065) - lw t2, 96(t1) // wsptr[0] - mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865) - lh t5, 124(t1) // wsptr[7] - lh t6, 116(t1) // wsptr[5] - lh t7, 108(t1) // wsptr[3] - lh t8, 100(t1) // wsptr[1] - ins t5, t6, 16, 16 - ins t7, t8, 16, 16 - mult $ac0, zero, zero - dpa.w.ph $ac0, t5, s0 - dpa.w.ph $ac0, t7, s1 - mult $ac1, zero, zero - dpa.w.ph $ac1, t5, s2 - dpa.w.ph $ac1, t7, s3 - sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1) - mflo s6, $ac0 - // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) - subu s4, s4, s5 - addu t3, t2, s4 // tmp10 = tmp0 + z2; - mflo s7, $ac1 - subu t4, t2, s4 // tmp10 = tmp0 - z2; - addu t7, t4, s6 - subu t8, t4, s6 - addu t5, t3, s7 - subu t6, t3, s7 - shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) - shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) - shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) - shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) - sll s4, t9, 2 - lw v0, 12(a2) // output_buf[ctr] - shll_s.w t5, t5, 24 - shll_s.w t6, t6, 24 - shll_s.w t7, t7, 24 - shll_s.w t8, t8, 24 - sra t5, t5, 24 - sra t6, t6, 24 - sra t7, t7, 24 - sra t8, t8, 24 - addu v0, v0, a3 // outptr = output_buf[ctr] + 
output_col - addiu t5, t5, 128 - addiu t6, t6, 128 - addiu t7, t7, 128 - addiu t8, t8, 128 - sb t5, 0(v0) - sb t7, 1(v0) - sb t8, 2(v0) - sb t6, 3(v0) - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop -END(jsimd_idct_4x4_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_6x6_dspr2) -/* - * a0 = compptr->dct_table - * a1 = coef_block - * a2 = output_buf - * a3 = output_col - */ - .set at - - SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - addiu sp, sp, -144 - move v0, sp - addiu v1, v0, 24 - addiu t9, zero, 5793 - addiu s0, zero, 10033 - addiu s1, zero, 2998 - -1: - lh s2, 0(a0) // q0 = quantptr[ 0] - lh s3, 32(a0) // q1 = quantptr[16] - lh s4, 64(a0) // q2 = quantptr[32] - lh t2, 64(a1) // tmp2 = inptr[32] - lh t1, 32(a1) // tmp1 = inptr[16] - lh t0, 0(a1) // tmp0 = inptr[ 0] - mul t2, t2, s4 // tmp2 = tmp2 * q2 - mul t1, t1, s3 // tmp1 = tmp1 * q1 - mul t0, t0, s2 // tmp0 = tmp0 * q0 - lh t6, 16(a1) // z1 = inptr[ 8] - lh t8, 80(a1) // z3 = inptr[40] - lh t7, 48(a1) // z2 = inptr[24] - lh s2, 16(a0) // q0 = quantptr[ 8] - lh s4, 80(a0) // q2 = quantptr[40] - lh s3, 48(a0) // q1 = quantptr[24] - mul t2, t2, t9 // tmp2 = tmp2 * 5793 - mul t1, t1, s0 // tmp1 = tmp1 * 10033 - sll t0, t0, 13 // tmp0 = tmp0 << 13 - mul t6, t6, s2 // z1 = z1 * q0 - mul t8, t8, s4 // z3 = z3 * q2 - mul t7, t7, s3 // z2 = z2 * q1 - addu t3, t0, t2 // tmp10 = tmp0 + tmp2 - sll t2, t2, 1 // tmp2 = tmp2 << 2 - subu t4, t0, t2 // tmp11 = tmp0 - tmp2; - subu t5, t3, t1 // tmp12 = tmp10 - tmp1 - addu t3, t3, t1 // tmp10 = tmp10 + tmp1 - addu t1, t6, t8 // tmp1 = z1 + z3 - mul t1, t1, s1 // tmp1 = tmp1 * 2998 - shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 - subu t2, t6, t8 // tmp2 = z1 - z3 - subu t2, t2, t7 // tmp2 = tmp2 - z2 - sll t2, t2, 2 // tmp2 = tmp2 << 2 - addu t0, t6, t7 // tmp0 = z1 + z2 - sll t0, t0, 13 // tmp0 = tmp0 << 13 - subu s2, t8, t7 // q0 = z3 - z2 - sll s2, s2, 13 // q0 = q0 << 13 - addu t0, t0, t1 // tmp0 = tmp0 + tmp1 - addu t1, s2, t1 // tmp1 = q0 + tmp1 - addu s2, t4, t2 // q0 = tmp11 + tmp2 - subu s3, t4, t2 // q1 = tmp11 - tmp2 - addu t6, t3, t0 // z1 = tmp10 + tmp0 - subu t7, t3, t0 // z2 = tmp10 - tmp0 - addu t4, t5, t1 // tmp11 = tmp12 + tmp1 - subu t5, t5, t1 // tmp12 = tmp12 - tmp1 - shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 - shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 - shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 - shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 - sw s2, 24(v0) - sw s3, 96(v0) - sw t6, 0(v0) - sw t7, 120(v0) - sw t4, 48(v0) - sw t5, 72(v0) - addiu v0, v0, 4 - addiu a1, a1, 2 - bne v0, v1, 1b - addiu a0, a0, 2 - - /* Pass 2: process 6 rows from work array, store into output array. 
*/ - move v0, sp - addiu v1, v0, 144 - -2: - lw t0, 0(v0) - lw t2, 16(v0) - lw s5, 0(a2) - addiu t0, t0, 16 - sll t0, t0, 13 - mul t3, t2, t9 - lw t6, 4(v0) - lw t8, 20(v0) - lw t7, 12(v0) - addu s5, s5, a3 - addu s6, t6, t8 - mul s6, s6, s1 - addu t1, t0, t3 - subu t4, t0, t3 - subu t4, t4, t3 - lw t3, 8(v0) - mul t0, t3, s0 - addu s7, t6, t7 - sll s7, s7, 13 - addu s7, s6, s7 - subu t2, t8, t7 - sll t2, t2, 13 - addu t2, s6, t2 - subu s6, t6, t7 - subu s6, s6, t8 - sll s6, s6, 13 - addu t3, t1, t0 - subu t5, t1, t0 - addu t6, t3, s7 - subu t3, t3, s7 - addu t7, t4, s6 - subu t4, t4, s6 - addu t8, t5, t2 - subu t5, t5, t2 - shll_s.w t6, t6, 6 - shll_s.w t3, t3, 6 - shll_s.w t7, t7, 6 - shll_s.w t4, t4, 6 - shll_s.w t8, t8, 6 - shll_s.w t5, t5, 6 - sra t6, t6, 24 - addiu t6, t6, 128 - sra t3, t3, 24 - addiu t3, t3, 128 - sb t6, 0(s5) - sra t7, t7, 24 - addiu t7, t7, 128 - sb t3, 5(s5) - sra t4, t4, 24 - addiu t4, t4, 128 - sb t7, 1(s5) - sra t8, t8, 24 - addiu t8, t8, 128 - sb t4, 4(s5) - addiu v0, v0, 24 - sra t5, t5, 24 - addiu t5, t5, 128 - sb t8, 2(s5) - addiu a2, a2, 4 - bne v0, v1, 2b - sb t5, 3(s5) - - addiu sp, sp, 144 - - RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 - - j ra - nop - -END(jsimd_idct_6x6_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) -/* - * a0 = compptr->dct_table - * a1 = coef_block - * a2 = workspace - */ - SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 - - li a3, 8 - -1: - // odd part - lh t0, 48(a1) - lh t1, 48(a0) - lh t2, 16(a1) - lh t3, 16(a0) - lh t4, 80(a1) - lh t5, 80(a0) - lh t6, 112(a1) - lh t7, 112(a0) - mul t0, t0, t1 // z2 - mul t1, t2, t3 // z1 - mul t2, t4, t5 // z3 - mul t3, t6, t7 // z4 - li t4, 10703 // FIX(1.306562965) - li t5, 4433 // FIX_0_541196100 - li t6, 7053 // FIX(0.860918669) - mul t4, t0, t4 // tmp11 - mul t5, t0, t5 // -tmp14 - addu t7, t1, t2 // tmp10 - addu t8, t7, t3 // tmp10 + z4 - mul t6, t6, t8 // tmp15 - li t8, 2139 // FIX(0.261052384) - mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) - li t7, 2295 // FIX(0.280143716) - mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) - addu t9, t2, t3 // z3 + z4 - li s0, 8565 // FIX(1.045510580) - mul t9, t9, s0 // -tmp13 - li s0, 12112 // FIX(1.478575242) - mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242) - li s1, 12998 // FIX(1.586706681) - mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) - li s2, 5540 // FIX(0.676326758) - mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) - li s3, 16244 // FIX(1.982889723) - mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) - subu t1, t1, t3 // z1-=z4 - subu t0, t0, t2 // z2-=z3 - addu t2, t0, t1 // z1+z2 - li t3, 4433 // FIX_0_541196100 - mul t2, t2, t3 // z3 - li t3, 6270 // FIX_0_765366865 - mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) - li t3, 15137 // FIX_0_765366865 - mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) - addu t8, t6, t8 // tmp12 - addu t3, t8, t4 // tmp12 + tmp11 - addu t3, t3, t7 // tmp10 - subu t8, t8, t9 // tmp12 + tmp13 - addu s0, t5, s0 - subu t8, t8, s0 // tmp12 - subu t9, t6, t9 - subu s1, s1, t4 - addu t9, t9, s1 // tmp13 - subu t6, t6, t5 - subu t6, t6, s2 - subu t6, t6, s3 // tmp15 - // even part start - lh t4, 64(a1) - lh t5, 64(a0) - lh t7, 32(a1) - lh s0, 32(a0) - lh s1, 0(a1) - lh s2, 0(a0) - lh s3, 96(a1) - lh v0, 96(a0) - mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) - mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) - mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - 
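- // DEQUANTIZE() is a plain 16 x 16 -> 32-bit multiply: each coefficient
- // is scaled by the corresponding entry of the per-component quantization
- // table before entering the even part of the IDCT.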
mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) - // odd part end - addu t1, t2, t1 // tmp11 - subu t0, t2, t0 // tmp14 - // update counter and pointers - addiu a3, a3, -1 - addiu a0, a0, 2 - addiu a1, a1, 2 - // even part rest - li s1, 10033 - li s2, 11190 - mul t4, t4, s1 // z4 - mul s1, t5, s2 // z4 - sll t5, t5, 13 // z1 - sll t7, t7, 13 - addiu t7, t7, 1024 // z3 - sll s0, s0, 13 // z2 - addu s2, t7, t4 // tmp10 - subu t4, t7, t4 // tmp11 - subu s3, t5, s0 // tmp12 - addu t2, t7, s3 // tmp21 - subu s3, t7, s3 // tmp24 - addu t7, s1, s0 // tmp12 - addu v0, s2, t7 // tmp20 - subu s2, s2, t7 // tmp25 - subu s1, s1, t5 // z4 - z1 - subu s1, s1, s0 // tmp12 - addu s0, t4, s1 // tmp22 - subu t4, t4, s1 // tmp23 - // final output stage - addu t5, v0, t3 - subu v0, v0, t3 - addu t3, t2, t1 - subu t2, t2, t1 - addu t1, s0, t8 - subu s0, s0, t8 - addu t8, t4, t9 - subu t4, t4, t9 - addu t9, s3, t0 - subu s3, s3, t0 - addu t0, s2, t6 - subu s2, s2, t6 - sra t5, t5, 11 - sra t3, t3, 11 - sra t1, t1, 11 - sra t8, t8, 11 - sra t9, t9, 11 - sra t0, t0, 11 - sra s2, s2, 11 - sra s3, s3, 11 - sra t4, t4, 11 - sra s0, s0, 11 - sra t2, t2, 11 - sra v0, v0, 11 - sw t5, 0(a2) - sw t3, 32(a2) - sw t1, 64(a2) - sw t8, 96(a2) - sw t9, 128(a2) - sw t0, 160(a2) - sw s2, 192(a2) - sw s3, 224(a2) - sw t4, 256(a2) - sw s0, 288(a2) - sw t2, 320(a2) - sw v0, 352(a2) - bgtz a3, 1b - addiu a2, a2, 4 - - RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 - - j ra - nop - -END(jsimd_idct_12x12_pass1_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) -/* - * a0 = workspace - * a1 = output - */ - SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 - - li a3, 12 - -1: - // Odd part - lw t0, 12(a0) - lw t1, 4(a0) - lw t2, 20(a0) - lw t3, 28(a0) - li t4, 10703 // FIX(1.306562965) - li t5, 4433 // FIX_0_541196100 - mul t4, t0, t4 // tmp11 - mul t5, t0, t5 // -tmp14 - addu t6, t1, t2 // tmp10 - li t7, 2139 // FIX(0.261052384) - mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) - addu t6, t6, t3 // tmp10 + z4 - li t8, 7053 // FIX(0.860918669) - mul t6, t6, t8 // tmp15 - li t8, 2295 // FIX(0.280143716) - mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) - addu t9, t2, t3 // z3 + z4 - li s0, 8565 // FIX(1.045510580) - mul t9, t9, s0 // -tmp13 - li s0, 12112 // FIX(1.478575242) - mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) - li s1, 12998 // FIX(1.586706681) - mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) - li s2, 5540 // FIX(0.676326758) - mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) - li s3, 16244 // FIX(1.982889723) - mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) - subu t1, t1, t3 // z1 -= z4 - subu t0, t0, t2 // z2 -= z3 - addu t2, t1, t0 // z1 + z2 - li t3, 4433 // FIX_0_541196100 - mul t2, t2, t3 // z3 - li t3, 6270 // FIX_0_765366865 - mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) - li t3, 15137 // FIX_1_847759065 - mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) - addu t3, t6, t7 // tmp12 - addu t7, t3, t4 - addu t7, t7, t8 // tmp10 - subu t3, t3, t9 - subu t3, t3, t5 - subu t3, t3, s0 // tmp12 - subu t9, t6, t9 - subu t9, t9, t4 - addu t9, t9, s1 // tmp13 - subu t6, t6, t5 - subu t6, t6, s2 - subu t6, t6, s3 // tmp15 - addu t1, t2, t1 // tmp11 - subu t0, t2, t0 // tmp14 - // even part - lw t2, 16(a0) // z4 - lw t4, 8(a0) // z1 - lw t5, 0(a0) // z3 - lw t8, 24(a0) // z2 - li s0, 10033 // FIX(1.224744871) - li s1, 11190 // FIX(1.366025404) - mul t2, t2, s0 // z4 - mul s0, t4, s1 // z4 - addiu t5, t5, 0x10 - sll t5, t5, 13 // 
z3 - sll t4, t4, 13 // z1 - sll t8, t8, 13 // z2 - subu s1, t4, t8 // tmp12 - addu s2, t5, t2 // tmp10 - subu t2, t5, t2 // tmp11 - addu s3, t5, s1 // tmp21 - subu s1, t5, s1 // tmp24 - addu t5, s0, t8 // tmp12 - addu v0, s2, t5 // tmp20 - subu t5, s2, t5 // tmp25 - subu t4, s0, t4 - subu t4, t4, t8 // tmp12 - addu t8, t2, t4 // tmp22 - subu t2, t2, t4 // tmp23 - // increment counter and pointers - addiu a3, a3, -1 - addiu a0, a0, 32 - // Final stage - addu t4, v0, t7 - subu v0, v0, t7 - addu t7, s3, t1 - subu s3, s3, t1 - addu t1, t8, t3 - subu t8, t8, t3 - addu t3, t2, t9 - subu t2, t2, t9 - addu t9, s1, t0 - subu s1, s1, t0 - addu t0, t5, t6 - subu t5, t5, t6 - sll t4, t4, 4 - sll t7, t7, 4 - sll t1, t1, 4 - sll t3, t3, 4 - sll t9, t9, 4 - sll t0, t0, 4 - sll t5, t5, 4 - sll s1, s1, 4 - sll t2, t2, 4 - sll t8, t8, 4 - sll s3, s3, 4 - sll v0, v0, 4 - shll_s.w t4, t4, 2 - shll_s.w t7, t7, 2 - shll_s.w t1, t1, 2 - shll_s.w t3, t3, 2 - shll_s.w t9, t9, 2 - shll_s.w t0, t0, 2 - shll_s.w t5, t5, 2 - shll_s.w s1, s1, 2 - shll_s.w t2, t2, 2 - shll_s.w t8, t8, 2 - shll_s.w s3, s3, 2 - shll_s.w v0, v0, 2 - srl t4, t4, 24 - srl t7, t7, 24 - srl t1, t1, 24 - srl t3, t3, 24 - srl t9, t9, 24 - srl t0, t0, 24 - srl t5, t5, 24 - srl s1, s1, 24 - srl t2, t2, 24 - srl t8, t8, 24 - srl s3, s3, 24 - srl v0, v0, 24 - lw t6, 0(a1) - addiu t4, t4, 0x80 - addiu t7, t7, 0x80 - addiu t1, t1, 0x80 - addiu t3, t3, 0x80 - addiu t9, t9, 0x80 - addiu t0, t0, 0x80 - addiu t5, t5, 0x80 - addiu s1, s1, 0x80 - addiu t2, t2, 0x80 - addiu t8, t8, 0x80 - addiu s3, s3, 0x80 - addiu v0, v0, 0x80 - sb t4, 0(t6) - sb t7, 1(t6) - sb t1, 2(t6) - sb t3, 3(t6) - sb t9, 4(t6) - sb t0, 5(t6) - sb t5, 6(t6) - sb s1, 7(t6) - sb t2, 8(t6) - sb t8, 9(t6) - sb s3, 10(t6) - sb v0, 11(t6) - bgtz a3, 1b - addiu a1, a1, 4 - - RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 - - jr ra - nop - -END(jsimd_idct_12x12_pass2_dspr2) - - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_convsamp_dspr2) -/* - * a0 = sample_data - * a1 = start_col - * a2 = workspace - */ - lw t0, 0(a0) - li t7, 0xff80ff80 - addu t0, t0, a1 - ulw t1, 0(t0) - ulw t2, 4(t0) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - lw t0, 4(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 0(a2) - usw t4, 4(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 8(a2) - usw t6, 12(a2) - - lw t0, 8(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 16(a2) - usw t4, 20(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 24(a2) - usw t6, 28(a2) - - lw t0, 12(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 32(a2) - usw t4, 36(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 40(a2) - usw t6, 44(a2) - - lw t0, 16(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 48(a2) - usw t4, 52(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 56(a2) - usw t6, 60(a2) - - lw t0, 20(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, 
t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 64(a2) - usw t4, 68(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 72(a2) - usw t6, 76(a2) - - lw t0, 24(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 80(a2) - usw t4, 84(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 88(a2) - usw t6, 92(a2) - - lw t0, 28(a0) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu t0, t0, a1 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - ulw t1, 0(t0) - ulw t2, 4(t0) - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 96(a2) - usw t4, 100(a2) - preceu.ph.qbr t3, t1 - preceu.ph.qbl t4, t1 - usw t5, 104(a2) - usw t6, 108(a2) - preceu.ph.qbr t5, t2 - preceu.ph.qbl t6, t2 - addu.ph t3, t3, t7 - addu.ph t4, t4, t7 - addu.ph t5, t5, t7 - addu.ph t6, t6, t7 - usw t3, 112(a2) - usw t4, 116(a2) - usw t5, 120(a2) - usw t6, 124(a2) - - j ra - nop - -END(jsimd_convsamp_dspr2) - - -#ifndef __mips_soft_float - -/*****************************************************************************/ -LEAF_DSPR2(jsimd_convsamp_float_dspr2) -/* - * a0 = sample_data - * a1 = start_col - * a2 = workspace - */ - .set at - - lw t0, 0(a0) - addu t0, t0, a1 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 4(a0) - swc1 f2, 0(a2) - swc1 f4, 4(a2) - swc1 f6, 8(a2) - addu t0, t0, a1 - swc1 f8, 12(a2) - swc1 f10, 16(a2) - swc1 f12, 20(a2) - swc1 f14, 24(a2) - swc1 f16, 28(a2) - // elemr 1 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 8(a0) - swc1 f2, 32(a2) - swc1 f4, 36(a2) - swc1 f6, 40(a2) - addu t0, t0, a1 - swc1 f8, 44(a2) - swc1 f10, 48(a2) - swc1 f12, 52(a2) - swc1 f14, 56(a2) - swc1 f16, 60(a2) - // elemr 2 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 12(a0) - swc1 f2, 64(a2) - swc1 f4, 68(a2) - swc1 f6, 72(a2) - addu t0, t0, a1 - swc1 f8, 76(a2) - swc1 f10, 80(a2) - swc1 f12, 84(a2) - 
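- // Every row follows the same pattern: load 8 samples with lbu,
- // level-shift by -128 (CENTERJSAMPLE), move them to the FPU with mtc1,
- // convert to single precision with cvt.s.w, and store the floats to the
- // workspace with swc1.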
swc1 f14, 88(a2) - swc1 f16, 92(a2) - // elemr 3 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 16(a0) - swc1 f2, 96(a2) - swc1 f4, 100(a2) - swc1 f6, 104(a2) - addu t0, t0, a1 - swc1 f8, 108(a2) - swc1 f10, 112(a2) - swc1 f12, 116(a2) - swc1 f14, 120(a2) - swc1 f16, 124(a2) - // elemr 4 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 20(a0) - swc1 f2, 128(a2) - swc1 f4, 132(a2) - swc1 f6, 136(a2) - addu t0, t0, a1 - swc1 f8, 140(a2) - swc1 f10, 144(a2) - swc1 f12, 148(a2) - swc1 f14, 152(a2) - swc1 f16, 156(a2) - // elemr 5 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 24(a0) - swc1 f2, 160(a2) - swc1 f4, 164(a2) - swc1 f6, 168(a2) - addu t0, t0, a1 - swc1 f8, 172(a2) - swc1 f10, 176(a2) - swc1 f12, 180(a2) - swc1 f14, 184(a2) - swc1 f16, 188(a2) - // elemr 6 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - lw t0, 28(a0) - swc1 f2, 192(a2) - swc1 f4, 196(a2) - swc1 f6, 200(a2) - addu t0, t0, a1 - swc1 f8, 204(a2) - swc1 f10, 208(a2) - swc1 f12, 212(a2) - swc1 f14, 216(a2) - swc1 f16, 220(a2) - // elemr 7 - lbu t1, 0(t0) - lbu t2, 1(t0) - lbu t3, 2(t0) - lbu t4, 3(t0) - lbu t5, 4(t0) - lbu t6, 5(t0) - lbu t7, 6(t0) - lbu t8, 7(t0) - addiu t1, t1, -128 - addiu t2, t2, -128 - addiu t3, t3, -128 - addiu t4, t4, -128 - addiu t5, t5, -128 - addiu t6, t6, -128 - addiu t7, t7, -128 - addiu t8, t8, -128 - mtc1 t1, f2 - mtc1 t2, f4 - mtc1 t3, f6 - mtc1 t4, f8 - mtc1 t5, f10 - mtc1 t6, f12 - mtc1 t7, f14 - mtc1 t8, f16 - cvt.s.w f2, f2 - cvt.s.w f4, f4 - cvt.s.w f6, f6 - 
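- // Last row: no further row pointer needs to be loaded; the remaining
- // conversions and stores complete the 8x8 float workspace
- // (64 samples * 4 bytes = 256 bytes).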
cvt.s.w f8, f8 - cvt.s.w f10, f10 - cvt.s.w f12, f12 - cvt.s.w f14, f14 - cvt.s.w f16, f16 - swc1 f2, 224(a2) - swc1 f4, 228(a2) - swc1 f6, 232(a2) - swc1 f8, 236(a2) - swc1 f10, 240(a2) - swc1 f12, 244(a2) - swc1 f14, 248(a2) - swc1 f16, 252(a2) - - j ra - nop - -END(jsimd_convsamp_float_dspr2) - -#endif - -/*****************************************************************************/ diff --git a/simd/mips/jsimd_dspr2_asm.h b/simd/mips/jsimd_dspr2_asm.h deleted file mode 100644 index 12cfda4..0000000 --- a/simd/mips/jsimd_dspr2_asm.h +++ /dev/null @@ -1,292 +0,0 @@ -/* - * MIPS DSPr2 optimizations for libjpeg-turbo - * - * Copyright (C) 2013, MIPS Technologies, Inc., California. - * Copyright (C) 2018, Matthieu Darbois. - * All Rights Reserved. - * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) - * Darko Laus (darko.laus@imgtec.com) - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#define zero $0 -#define AT $1 -#define v0 $2 -#define v1 $3 -#define a0 $4 -#define a1 $5 -#define a2 $6 -#define a3 $7 -#define t0 $8 -#define t1 $9 -#define t2 $10 -#define t3 $11 -#define t4 $12 -#define t5 $13 -#define t6 $14 -#define t7 $15 -#define s0 $16 -#define s1 $17 -#define s2 $18 -#define s3 $19 -#define s4 $20 -#define s5 $21 -#define s6 $22 -#define s7 $23 -#define t8 $24 -#define t9 $25 -#define k0 $26 -#define k1 $27 -#define gp $28 -#define sp $29 -#define fp $30 -#define s8 $30 -#define ra $31 - -#define f0 $f0 -#define f1 $f1 -#define f2 $f2 -#define f3 $f3 -#define f4 $f4 -#define f5 $f5 -#define f6 $f6 -#define f7 $f7 -#define f8 $f8 -#define f9 $f9 -#define f10 $f10 -#define f11 $f11 -#define f12 $f12 -#define f13 $f13 -#define f14 $f14 -#define f15 $f15 -#define f16 $f16 -#define f17 $f17 -#define f18 $f18 -#define f19 $f19 -#define f20 $f20 -#define f21 $f21 -#define f22 $f22 -#define f23 $f23 -#define f24 $f24 -#define f25 $f25 -#define f26 $f26 -#define f27 $f27 -#define f28 $f28 -#define f29 $f29 -#define f30 $f30 -#define f31 $f31 - -#ifdef __ELF__ -#define HIDDEN_SYMBOL(symbol) .hidden symbol; -#else -#define HIDDEN_SYMBOL(symbol) -#endif - -/* - * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 - */ -#define LEAF_MIPS32R2(symbol) \ - .globl symbol; \ - HIDDEN_SYMBOL(symbol) \ - .align 2; \ - .type symbol, @function; \ - .ent symbol, 0; \ -symbol: \ - .frame sp, 0, ra; \ - .set push; \ - .set arch = mips32r2; \ - .set noreorder; \ - .set noat; - -/* - * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2 - */ -#define LEAF_DSPR2(symbol) \ -LEAF_MIPS32R2(symbol) \ - .set dspr2; - -/* - * END - mark end of function - */ -#define END(function) \ - .set pop; \ - .end function; \ - .size function, .-function - -/* - * Checks if stack offset is big enough for 
storing/restoring regs_num
- * registers to/from the stack. The stack offset must be greater than
- * or equal to the number of bytes needed to store the registers
- * (regs_num * 4). Since the MIPS ABI allows use of the first 16 bytes of
- * the stack frame (reserved for the functions' input arguments, already
- * stored in a0-a3), the stack size can be further optimized by utilizing
- * this space.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
-
-/*
- * Saves a set of registers on the stack. The maximum number of registers
- * that can be saved on the stack is limited to 14 (a0-a3, v0-v1 and
- * s0-s7). The stack offset is the number of bytes subtracted from the
- * stack pointer (sp) before the registers are pushed, in order to provide
- * enough space on the stack (the offset must be a multiple of 4, and must
- * be big enough, as described by the CHECK_STACK_OFFSET macro). This
- * macro is intended to be used in combination with the
- * RESTORE_REGS_FROM_STACK macro. Example:
- *   SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *   RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
-                          r2 = 0, r3 = 0, r4 = 0, \
-                          r5 = 0, r6 = 0, r7 = 0, \
-                          r8 = 0, r9 = 0, r10 = 0, \
-                          r11 = 0, r12 = 0, r13 = 0, \
-                          r14 = 0
-.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
-  .error "Stack offset must be positive and multiple of 4."
-.endif
-.if \stack_offset != 0
-  addiu sp, sp, -\stack_offset
-.endif
-  sw \r1, 0(sp)
-.if \r2 != 0
-  sw \r2, 4(sp)
-.endif
-.if \r3 != 0
-  sw \r3, 8(sp)
-.endif
-.if \r4 != 0
-  sw \r4, 12(sp)
-.endif
-.if \r5 != 0
-  CHECK_STACK_OFFSET 5, \stack_offset
-  sw \r5, 16(sp)
-.endif
-.if \r6 != 0
-  CHECK_STACK_OFFSET 6, \stack_offset
-  sw \r6, 20(sp)
-.endif
-.if \r7 != 0
-  CHECK_STACK_OFFSET 7, \stack_offset
-  sw \r7, 24(sp)
-.endif
-.if \r8 != 0
-  CHECK_STACK_OFFSET 8, \stack_offset
-  sw \r8, 28(sp)
-.endif
-.if \r9 != 0
-  CHECK_STACK_OFFSET 9, \stack_offset
-  sw \r9, 32(sp)
-.endif
-.if \r10 != 0
-  CHECK_STACK_OFFSET 10, \stack_offset
-  sw \r10, 36(sp)
-.endif
-.if \r11 != 0
-  CHECK_STACK_OFFSET 11, \stack_offset
-  sw \r11, 40(sp)
-.endif
-.if \r12 != 0
-  CHECK_STACK_OFFSET 12, \stack_offset
-  sw \r12, 44(sp)
-.endif
-.if \r13 != 0
-  CHECK_STACK_OFFSET 13, \stack_offset
-  sw \r13, 48(sp)
-.endif
-.if \r14 != 0
-  CHECK_STACK_OFFSET 14, \stack_offset
-  sw \r14, 52(sp)
-.endif
-.endm
-
-/*
- * Restores a set of registers from the stack. The maximum number of
- * registers that can be restored from the stack is limited to 14 (a0-a3,
- * v0-v1 and s0-s7). The stack offset is the number of bytes added to the
- * stack pointer (sp) after the registers are restored (the offset must be
- * a multiple of 4, and must be big enough, as described by the
- * CHECK_STACK_OFFSET macro). This macro is intended to be used in
- * combination with the SAVE_REGS_ON_STACK macro. Example:
- *   SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *   RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
-                               r2 = 0, r3 = 0, r4 = 0, \
-                               r5 = 0, r6 = 0, r7 = 0, \
-                               r8 = 0, r9 = 0, r10 = 0, \
-                               r11 = 0, r12 = 0, r13 = 0, \
-                               r14 = 0
-.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
-  .error "Stack offset must be positive and multiple of 4."
-.endif - lw \r1, 0(sp) -.if \r2 != 0 - lw \r2, 4(sp) -.endif -.if \r3 != 0 - lw \r3, 8(sp) -.endif -.if \r4 != 0 - lw \r4, 12(sp) -.endif -.if \r5 != 0 - CHECK_STACK_OFFSET 5, \stack_offset - lw \r5, 16(sp) -.endif -.if \r6 != 0 - CHECK_STACK_OFFSET 6, \stack_offset - lw \r6, 20(sp) -.endif -.if \r7 != 0 - CHECK_STACK_OFFSET 7, \stack_offset - lw \r7, 24(sp) -.endif -.if \r8 != 0 - CHECK_STACK_OFFSET 8, \stack_offset - lw \r8, 28(sp) -.endif -.if \r9 != 0 - CHECK_STACK_OFFSET 9, \stack_offset - lw \r9, 32(sp) -.endif -.if \r10 != 0 - CHECK_STACK_OFFSET 10, \stack_offset - lw \r10, 36(sp) -.endif -.if \r11 != 0 - CHECK_STACK_OFFSET 11, \stack_offset - lw \r11, 40(sp) -.endif -.if \r12 != 0 - CHECK_STACK_OFFSET 12, \stack_offset - lw \r12, 44(sp) -.endif -.if \r13 != 0 - CHECK_STACK_OFFSET 13, \stack_offset - lw \r13, 48(sp) -.endif -.if \r14 != 0 - CHECK_STACK_OFFSET 14, \stack_offset - lw \r14, 52(sp) -.endif -.if \stack_offset != 0 - addiu sp, sp, \stack_offset -.endif -.endm diff --git a/simd/nasm/jcolsamp.inc b/simd/nasm/jcolsamp.inc index a2d5b49..6f6d7f2 100644 --- a/simd/nasm/jcolsamp.inc +++ b/simd/nasm/jcolsamp.inc @@ -7,8 +7,6 @@ ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; [TAB8] ; -------------------------------------------------------------------------- diff --git a/simd/nasm/jdct.inc b/simd/nasm/jdct.inc index 79d5146..9192f66 100644 --- a/simd/nasm/jdct.inc +++ b/simd/nasm/jdct.inc @@ -7,8 +7,6 @@ ; Based on the x86 SIMD extension for IJG JPEG library ; Copyright (C) 1999-2006, MIYASAKA Masaru. ; For conditions of distribution and use, see copyright notice in jsimdext.inc -; -; [TAB8] ; Each IDCT routine is responsible for range-limiting its results and ; converting them to unsigned form (0..MAXJSAMPLE). 
The raw outputs could
diff --git a/simd/nasm/jpeg_nbits_table.inc b/simd/nasm/jpeg_nbits_table.inc
deleted file mode 100644
index 2ce6c28..0000000
--- a/simd/nasm/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db \
- 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, \
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \
[Remaining 4,094 deleted rows elided. The file was a single 65,536-entry byte lookup table, 16 entries per row: jpeg_nbits_table[n] held the number of bits needed to represent n, i.e. 0 for n = 0 and k for 2^(k-1) <= n < 2^k, rising to 16 for the final entries. Each run of a given value k is twice as long as the run of k-1, which is why the rows repeat a single value for pages at a time.]
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ 
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ 
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ 
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \ - 15, 15, 15, 15, 
- 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
  [... hundreds of further deleted rows, each containing only the value 15, elided ...]
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
  [... hundreds of further deleted rows, each containing only the value 16, elided ...]
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ 
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ 
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ 
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 
16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \ 
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
- [... elided: the remainder of this deleted table is the value 16 repeated, sixteen entries per row, for several hundred further rows ...] \
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc index b40901f..9930d80 100644 --- a/simd/nasm/jsimdext.inc +++ b/simd/nasm/jsimdext.inc @@ -2,7 +2,7 @@ ; jsimdext.inc - common declarations ; ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB -; Copyright (C) 2010, 2016, D. R. Commander. +; Copyright (C) 2010, 2016, 2019, D. R. Commander. ; Copyright (C) 2018, Matthieu Darbois. ; ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02 @@ -24,8 +24,6 @@ ; 2. Altered source versions must be plainly marked as such, and must not be ; misrepresented as being the original software. ; 3. This notice may not be removed or altered from any source distribution.
-; -; [TAB8] ; ========================================================================== ; System-dependent configurations @@ -167,19 +165,19 @@ section .note.GNU-stack noalloc noexec nowrite progbits %define XMM_DWORD %define XMM_MMWORD -%define SIZEOF_BYTE 1 ; sizeof(BYTE) -%define SIZEOF_WORD 2 ; sizeof(WORD) -%define SIZEOF_DWORD 4 ; sizeof(DWORD) -%define SIZEOF_QWORD 8 ; sizeof(QWORD) -%define SIZEOF_OWORD 16 ; sizeof(OWORD) -%define SIZEOF_YWORD 32 ; sizeof(YWORD) +%define SIZEOF_BYTE 1 ; sizeof(byte) +%define SIZEOF_WORD 2 ; sizeof(word) +%define SIZEOF_DWORD 4 ; sizeof(dword) +%define SIZEOF_QWORD 8 ; sizeof(qword) +%define SIZEOF_OWORD 16 ; sizeof(oword) +%define SIZEOF_YWORD 32 ; sizeof(yword) %define BYTE_BIT 8 ; CHAR_BIT in C -%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT -%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT -%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT -%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT -%define YWORD_BIT 256 ; sizeof(YWORD)*BYTE_BIT +%define WORD_BIT 16 ; sizeof(word)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT +%define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT ; -------------------------------------------------------------------------- ; External Symbol Name @@ -198,6 +196,11 @@ section .note.GNU-stack noalloc noexec nowrite progbits %ifdef __YASM_VER__ %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern %define GLOBAL_DATA(name) global EXTN(name):private_extern +%else +%if __NASM_VERSION_ID__ >= 0x020E0000 +%define GLOBAL_FUNCTION(name) global EXTN(name):private_extern +%define GLOBAL_DATA(name) global EXTN(name):private_extern +%endif %endif %endif diff --git a/simd/powerpc/jccolext-altivec.c b/simd/powerpc/jccolext-altivec.c deleted file mode 100644 index 170f90f..0000000 --- a/simd/powerpc/jccolext-altivec.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2014, Jay Foad. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
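An aside on the "%if __NASM_VERSION_ID__ >= 0x020E0000" guard in the jsimdext.inc hunk above: NASM documents __NASM_VERSION_ID__ as packing one byte per version component, most significant first, so 0x020E0000 decodes to NASM 2.14.0 -- presumably the first release whose Mach-O backend accepts the :private_extern qualifier that the YASM branch already used. A minimal C illustration of that packing (decode_nasm_version_id is an invented name):

#include <stdio.h>

/* Decode NASM's __NASM_VERSION_ID__: one byte each for major, minor,
 * subminor, and patch, most significant byte first. */
static void decode_nasm_version_id(unsigned int id)
{
  printf("0x%08X -> NASM %u.%u.%u (patch %u)\n",
         id, (id >> 24) & 0xFF, (id >> 16) & 0xFF,
         (id >> 8) & 0xFF, id & 0xFF);
}

int main(void)
{
  decode_nasm_version_id(0x020E0000);  /* prints: NASM 2.14.0 (patch 0) */
  return 0;
}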
- */ - -/* This file is included by jccolor-altivec.c */ - - -void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) -{ - JSAMPROW inptr, outptr0, outptr1, outptr2; - int pitch = img_width * RGB_PIXELSIZE, num_cols; -#if __BIG_ENDIAN__ - int offset; -#endif - unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; - - __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, - rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr; -#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 - __vector unsigned char rgb3 = { 0 }; -#endif -#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 - __vector unsigned char rgb4 = { 0 }; -#endif - __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; - __vector unsigned short yl, yh, crl, crh, cbl, cbh; - __vector int y0, y1, y2, y3, cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; - - /* Constants */ - __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, - pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }, - pw_mf016_mf033 = { __4X2(-F_0_168, -F_0_331) }, - pw_mf008_mf041 = { __4X2(-F_0_081, -F_0_418) }; - __vector unsigned short pw_f050_f000 = { __4X2(F_0_500, 0) }; - __vector int pd_onehalf = { __4X(ONE_HALF) }, - pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) }; - __vector unsigned char pb_zero = { __16X(0) }, -#if __BIG_ENDIAN__ - shift_pack_index = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; -#else - shift_pack_index = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; -#endif - - while (--num_rows >= 0) { - inptr = *input_buf++; - outptr0 = output_buf[0][output_row]; - outptr1 = output_buf[1][output_row]; - outptr2 = output_buf[2][output_row]; - output_row++; - - for (num_cols = pitch; num_cols > 0; - num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, - outptr0 += 16, outptr1 += 16, outptr2 += 16) { - -#if __BIG_ENDIAN__ - /* Load 16 pixels == 48 or 64 bytes */ - offset = (size_t)inptr & 15; - if (offset) { - __vector unsigned char unaligned_shift_index; - int bytes = num_cols + offset; - - if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { - /* Slow path to prevent buffer overread. Since there is no way to - * read a partial AltiVec register, overread would occur on the last - * chunk of the last image row if the right edge is not on a 16-byte - * boundary. It could also occur on other rows if the bytes per row - * is low enough. Since we can't determine whether we're on the last - * image row, we have to assume every row is the last. 
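The slow-path rationale above is worth a concrete restatement. A scalar C sketch of the shape of the guard -- an editorial illustration, not code from the patch: chunk_base, VECBYTES, and PIXBYTES are invented names, with PIXBYTES standing in for RGB_PIXELSIZE == 3:

#include <stdio.h>
#include <string.h>

#define VECBYTES  16   /* width of one AltiVec register   */
#define PIXBYTES  3    /* stand-in for RGB_PIXELSIZE == 3 */

/* When fewer than PIXBYTES*16 bytes remain in the row and the tail is not
 * a whole number of 16-byte chunks, stage the bytes through a scratch
 * buffer so that full-width vector loads cannot read past the end of the
 * image row; reads beyond the copied bytes then stay inside scratch. */
static const unsigned char *chunk_base(const unsigned char *inptr,
                                       int bytes_left,
                                       unsigned char scratch[PIXBYTES * 16])
{
  if (bytes_left < PIXBYTES * 16 && (bytes_left & (VECBYTES - 1))) {
    memcpy(scratch, inptr, (size_t)bytes_left);  /* slow path */
    return scratch;
  }
  return inptr;                                  /* fast path */
}

int main(void)
{
  unsigned char row[20] = { 0 }, scratch[PIXBYTES * 16];
  /* 20 bytes left: under 48 and not a multiple of 16, so staging is used. */
  printf("staged: %s\n", chunk_base(row, 20, scratch) == scratch ? "yes" : "no");
  return 0;
}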
- */ - memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); - rgb0 = vec_ld(0, tmpbuf); - rgb1 = vec_ld(16, tmpbuf); - rgb2 = vec_ld(32, tmpbuf); -#if RGB_PIXELSIZE == 4 - rgb3 = vec_ld(48, tmpbuf); -#endif - } else { - /* Fast path */ - rgb0 = vec_ld(0, inptr); - if (bytes > 16) - rgb1 = vec_ld(16, inptr); - if (bytes > 32) - rgb2 = vec_ld(32, inptr); - if (bytes > 48) - rgb3 = vec_ld(48, inptr); -#if RGB_PIXELSIZE == 4 - if (bytes > 64) - rgb4 = vec_ld(64, inptr); -#endif - unaligned_shift_index = vec_lvsl(0, inptr); - rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); - rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); - rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); -#if RGB_PIXELSIZE == 4 - rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); -#endif - } - } else { -#endif /* __BIG_ENDIAN__ */ - if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { - /* Slow path */ - memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); - rgb0 = VEC_LD(0, tmpbuf); - rgb1 = VEC_LD(16, tmpbuf); - rgb2 = VEC_LD(32, tmpbuf); -#if RGB_PIXELSIZE == 4 - rgb3 = VEC_LD(48, tmpbuf); -#endif - } else { - /* Fast path */ - rgb0 = VEC_LD(0, inptr); - if (num_cols > 16) - rgb1 = VEC_LD(16, inptr); - if (num_cols > 32) - rgb2 = VEC_LD(32, inptr); -#if RGB_PIXELSIZE == 4 - if (num_cols > 48) - rgb3 = VEC_LD(48, inptr); -#endif - } -#if __BIG_ENDIAN__ - } -#endif - -#if RGB_PIXELSIZE == 3 - /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 - * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga - * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf - * - * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 - * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 - * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb - * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf - */ - rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); - rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); - rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); - rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); -#else - /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 - * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 - * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb - * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf - * - * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 - * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 - * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb - * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf - */ - rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); - rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); - rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); - rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); -#endif - - /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 - * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 - * ... - * - * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't - * support unsigned vectors. 
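The NOTE above is the crux of the unpacking step: AltiVec's vec_unpackh/vec_unpackl sign-extend, so zero extension has to be synthesized by merging each byte vector with a zero vector. A scalar model of what the VEC_UNPACKHU/VEC_UNPACKLU macros compute, assuming they zero-extend unsigned bytes into signed shorts (widen_u8 is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* Widen unsigned 8-bit samples to signed 16-bit with zero extension.
 * Sign extension would turn 0xFF into -1, which is exactly what the
 * merge-with-zero construction avoids. */
static void widen_u8(const uint8_t src[16], int16_t hi[8], int16_t lo[8])
{
  for (int i = 0; i < 8; i++) {
    hi[i] = (int16_t)src[i];       /* elements 0..7, zero-extended  */
    lo[i] = (int16_t)src[8 + i];   /* elements 8..15, zero-extended */
  }
}

int main(void)
{
  uint8_t s[16] = { 255, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  int16_t hi[8], lo[8];
  widen_u8(s, hi, lo);
  printf("%d\n", hi[0]);           /* 255, not -1 */
  return 0;
}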
- */ - rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); - bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); - rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); - bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); - rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); - bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); - rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); - bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); - - /* (Original) - * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - * - * (This implementation) - * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - */ - - /* Calculate Y values */ - - y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); - y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); - y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); - y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); - y0 = vec_msums(bg0, pw_f0114_f0250, y0); - y1 = vec_msums(bg1, pw_f0114_f0250, y1); - y2 = vec_msums(bg2, pw_f0114_f0250, y2); - y3 = vec_msums(bg3, pw_f0114_f0250, y3); - /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from - * each dword into a new 16-bit vector, which is the equivalent of - * descaling the 32-bit results (right-shifting by 16 bits) and then - * packing them. - */ - yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, - shift_pack_index); - yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, - shift_pack_index); - y = vec_pack(yl, yh); - vec_st(y, 0, outptr0); - - /* Calculate Cb values */ - cb0 = vec_msums(rg0, pw_mf016_mf033, pd_onehalfm1_cj); - cb1 = vec_msums(rg1, pw_mf016_mf033, pd_onehalfm1_cj); - cb2 = vec_msums(rg2, pw_mf016_mf033, pd_onehalfm1_cj); - cb3 = vec_msums(rg3, pw_mf016_mf033, pd_onehalfm1_cj); - cb0 = (__vector int)vec_msum((__vector unsigned short)bg0, pw_f050_f000, - (__vector unsigned int)cb0); - cb1 = (__vector int)vec_msum((__vector unsigned short)bg1, pw_f050_f000, - (__vector unsigned int)cb1); - cb2 = (__vector int)vec_msum((__vector unsigned short)bg2, pw_f050_f000, - (__vector unsigned int)cb2); - cb3 = (__vector int)vec_msum((__vector unsigned short)bg3, pw_f050_f000, - (__vector unsigned int)cb3); - cbl = vec_perm((__vector unsigned short)cb0, - (__vector unsigned short)cb1, shift_pack_index); - cbh = vec_perm((__vector unsigned short)cb2, - (__vector unsigned short)cb3, shift_pack_index); - cb = vec_pack(cbl, cbh); - vec_st(cb, 0, outptr1); - - /* Calculate Cr values */ - cr0 = vec_msums(bg0, pw_mf008_mf041, pd_onehalfm1_cj); - cr1 = vec_msums(bg1, pw_mf008_mf041, pd_onehalfm1_cj); - cr2 = vec_msums(bg2, pw_mf008_mf041, pd_onehalfm1_cj); - cr3 = vec_msums(bg3, pw_mf008_mf041, pd_onehalfm1_cj); - cr0 = (__vector int)vec_msum((__vector unsigned short)rg0, pw_f050_f000, - (__vector unsigned int)cr0); - cr1 = (__vector int)vec_msum((__vector unsigned short)rg1, pw_f050_f000, - (__vector unsigned int)cr1); - cr2 = (__vector int)vec_msum((__vector unsigned short)rg2, pw_f050_f000, - (__vector unsigned int)cr2); - cr3 = (__vector int)vec_msum((__vector unsigned short)rg3, pw_f050_f000, - (__vector unsigned int)cr3); - crl = vec_perm((__vector unsigned short)cr0, - (__vector unsigned short)cr1, shift_pack_index); - crh = vec_perm((__vector unsigned short)cr2, - (__vector unsigned short)cr3, shift_pack_index); - cr = 
vec_pack(crl, crh); - vec_st(cr, 0, outptr2); - } - } -} diff --git a/simd/powerpc/jccolor-altivec.c b/simd/powerpc/jccolor-altivec.c deleted file mode 100644 index d670dbc..0000000 --- a/simd/powerpc/jccolor-altivec.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* RGB --> YCC CONVERSION */ - -#include "jsimd_altivec.h" - - -#define F_0_081 5329 /* FIX(0.08131) */ -#define F_0_114 7471 /* FIX(0.11400) */ -#define F_0_168 11059 /* FIX(0.16874) */ -#define F_0_250 16384 /* FIX(0.25000) */ -#define F_0_299 19595 /* FIX(0.29900) */ -#define F_0_331 21709 /* FIX(0.33126) */ -#define F_0_418 27439 /* FIX(0.41869) */ -#define F_0_500 32768 /* FIX(0.50000) */ -#define F_0_587 38470 /* FIX(0.58700) */ -#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ - -#define SCALEBITS 16 -#define ONE_HALF (1 << (SCALEBITS - 1)) - - -#define RGBG_INDEX0 \ - { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 } -#define RGBG_INDEX1 \ - { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 } -#define RGBG_INDEX2 \ - { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 } -#define RGBG_INDEX3 \ - { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 } -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE - -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX0 -#undef RGBG_INDEX1 -#undef RGBG_INDEX2 -#undef RGBG_INDEX3 -#undef jsimd_rgb_ycc_convert_altivec - -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define RGBG_INDEX \ - { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 } -#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_ycc_convert_altivec - -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define RGBG_INDEX0 \ - { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 } -#define RGBG_INDEX1 \ - { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 } -#define RGBG_INDEX2 \ - { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 } -#define RGBG_INDEX3 \ - { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 } -#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX0 -#undef RGBG_INDEX1 -#undef RGBG_INDEX2 -#undef RGBG_INDEX3 -#undef jsimd_rgb_ycc_convert_altivec - -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define RGBG_INDEX \ - { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 
12, 13 } -#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_ycc_convert_altivec - -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define RGBG_INDEX \ - { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 } -#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_ycc_convert_altivec - -#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define RGBG_INDEX \ - { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 } -#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec -#include "jccolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_ycc_convert_altivec diff --git a/simd/powerpc/jcgray-altivec.c b/simd/powerpc/jcgray-altivec.c deleted file mode 100644 index a11a7e7..0000000 --- a/simd/powerpc/jcgray-altivec.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
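The jccolor-altivec.c file deleted above is built around repeated textual inclusion: jccolext-altivec.c is #included once per pixel layout, with RGB_PIXELSIZE, the RGBG_INDEX tables, and the function name redefined between inclusions. A self-contained sketch of the idea -- the repeated include is simulated with a macro here so the example stays in one file, and all names are illustrative:

#include <stdio.h>

/* One body, instantiated once per pixel layout; in the deleted code the
 * instantiation happens by re-#including a .c file instead of expanding
 * a macro, but the effect is the same. */
#define DEFINE_PIXEL_SUM(NAME, PIXELSIZE)                       \
  static int NAME(const unsigned char *px)                      \
  {                                                             \
    int i, s = 0;                                               \
    for (i = 0; i < (PIXELSIZE); i++)                           \
      s += px[i];  /* touch exactly one pixel's bytes */        \
    return s;                                                   \
  }

DEFINE_PIXEL_SUM(sum_rgb, 3)    /* cf. jsimd_extrgb_ycc_convert_altivec  */
DEFINE_PIXEL_SUM(sum_rgbx, 4)   /* cf. jsimd_extrgbx_ycc_convert_altivec */

int main(void)
{
  unsigned char px[4] = { 10, 20, 30, 40 };
  printf("%d %d\n", sum_rgb(px), sum_rgbx(px));  /* 60 100 */
  return 0;
}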
- */ - -/* RGB --> GRAYSCALE CONVERSION */ - -#include "jsimd_altivec.h" - - -#define F_0_114 7471 /* FIX(0.11400) */ -#define F_0_250 16384 /* FIX(0.25000) */ -#define F_0_299 19595 /* FIX(0.29900) */ -#define F_0_587 38470 /* FIX(0.58700) */ -#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ - -#define SCALEBITS 16 -#define ONE_HALF (1 << (SCALEBITS - 1)) - - -#define RGBG_INDEX0 \ - { 0, 1, 3, 4, 6, 7, 9, 10, 2, 1, 5, 4, 8, 7, 11, 10 } -#define RGBG_INDEX1 \ - { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 } -#define RGBG_INDEX2 \ - { 8, 9, 11, 12, 14, 15, 17, 18, 10, 9, 13, 12, 16, 15, 19, 18 } -#define RGBG_INDEX3 \ - { 4, 5, 7, 8, 10, 11, 13, 14, 6, 5, 9, 8, 12, 11, 15, 14 } -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE - -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX0 -#undef RGBG_INDEX1 -#undef RGBG_INDEX2 -#undef RGBG_INDEX3 -#undef jsimd_rgb_gray_convert_altivec - -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define RGBG_INDEX \ - { 0, 1, 4, 5, 8, 9, 12, 13, 2, 1, 6, 5, 10, 9, 14, 13 } -#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_gray_convert_altivec - -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define RGBG_INDEX0 \ - { 2, 1, 5, 4, 8, 7, 11, 10, 0, 1, 3, 4, 6, 7, 9, 10 } -#define RGBG_INDEX1 \ - { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 } -#define RGBG_INDEX2 \ - { 10, 9, 13, 12, 16, 15, 19, 18, 8, 9, 11, 12, 14, 15, 17, 18 } -#define RGBG_INDEX3 \ - { 6, 5, 9, 8, 12, 11, 15, 14, 4, 5, 7, 8, 10, 11, 13, 14 } -#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX0 -#undef RGBG_INDEX1 -#undef RGBG_INDEX2 -#undef RGBG_INDEX3 -#undef jsimd_rgb_gray_convert_altivec - -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define RGBG_INDEX \ - { 2, 1, 6, 5, 10, 9, 14, 13, 0, 1, 4, 5, 8, 9, 12, 13 } -#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_gray_convert_altivec - -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define RGBG_INDEX \ - { 3, 2, 7, 6, 11, 10, 15, 14, 1, 2, 5, 6, 9, 10, 13, 14 } -#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_gray_convert_altivec - -#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define RGBG_INDEX \ - { 1, 2, 5, 6, 9, 10, 13, 14, 3, 2, 7, 6, 11, 10, 15, 14 } -#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec -#include "jcgryext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGBG_INDEX -#undef jsimd_rgb_gray_convert_altivec diff --git a/simd/powerpc/jcgryext-altivec.c b/simd/powerpc/jcgryext-altivec.c deleted file mode 100644 index b280cbb..0000000 --- a/simd/powerpc/jcgryext-altivec.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * Copyright (C) 2014, Jay Foad. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. 
- * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* This file is included by jcgray-altivec.c */ - - -void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) -{ - JSAMPROW inptr, outptr; - int pitch = img_width * RGB_PIXELSIZE, num_cols; -#if __BIG_ENDIAN__ - int offset; - unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; -#endif - - __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 }, - rgbg0, rgbg1, rgbg2, rgbg3, y; -#if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4 - __vector unsigned char rgb3 = { 0 }; -#endif -#if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4 - __vector unsigned char rgb4 = { 0 }; -#endif - __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; - __vector unsigned short yl, yh; - __vector int y0, y1, y2, y3; - - /* Constants */ - __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) }, - pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) }; - __vector int pd_onehalf = { __4X(ONE_HALF) }; - __vector unsigned char pb_zero = { __16X(0) }, -#if __BIG_ENDIAN__ - shift_pack_index = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; -#else - shift_pack_index = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; -#endif - - while (--num_rows >= 0) { - inptr = *input_buf++; - outptr = output_buf[0][output_row]; - output_row++; - - for (num_cols = pitch; num_cols > 0; - num_cols -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16, - outptr += 16) { - -#if __BIG_ENDIAN__ - /* Load 16 pixels == 48 or 64 bytes */ - offset = (size_t)inptr & 15; - if (offset) { - __vector unsigned char unaligned_shift_index; - int bytes = num_cols + offset; - - if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { - /* Slow path to prevent buffer overread. Since there is no way to - * read a partial AltiVec register, overread would occur on the last - * chunk of the last image row if the right edge is not on a 16-byte - * boundary. It could also occur on other rows if the bytes per row - * is low enough. Since we can't determine whether we're on the last - * image row, we have to assume every row is the last. 
- */ - memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); - rgb0 = vec_ld(0, tmpbuf); - rgb1 = vec_ld(16, tmpbuf); - rgb2 = vec_ld(32, tmpbuf); -#if RGB_PIXELSIZE == 4 - rgb3 = vec_ld(48, tmpbuf); -#endif - } else { - /* Fast path */ - rgb0 = vec_ld(0, inptr); - if (bytes > 16) - rgb1 = vec_ld(16, inptr); - if (bytes > 32) - rgb2 = vec_ld(32, inptr); - if (bytes > 48) - rgb3 = vec_ld(48, inptr); -#if RGB_PIXELSIZE == 4 - if (bytes > 64) - rgb4 = vec_ld(64, inptr); -#endif - unaligned_shift_index = vec_lvsl(0, inptr); - rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index); - rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index); - rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index); -#if RGB_PIXELSIZE == 4 - rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index); -#endif - } - } else { - if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { - /* Slow path */ - memcpy(tmpbuf, inptr, min(num_cols, RGB_PIXELSIZE * 16)); - rgb0 = vec_ld(0, tmpbuf); - rgb1 = vec_ld(16, tmpbuf); - rgb2 = vec_ld(32, tmpbuf); -#if RGB_PIXELSIZE == 4 - rgb3 = vec_ld(48, tmpbuf); -#endif - } else { - /* Fast path */ - rgb0 = vec_ld(0, inptr); - if (num_cols > 16) - rgb1 = vec_ld(16, inptr); - if (num_cols > 32) - rgb2 = vec_ld(32, inptr); -#if RGB_PIXELSIZE == 4 - if (num_cols > 48) - rgb3 = vec_ld(48, inptr); -#endif - } - } -#else - /* Little endian */ - rgb0 = vec_vsx_ld(0, inptr); - if (num_cols > 16) - rgb1 = vec_vsx_ld(16, inptr); - if (num_cols > 32) - rgb2 = vec_vsx_ld(32, inptr); -#if RGB_PIXELSIZE == 4 - if (num_cols > 48) - rgb3 = vec_vsx_ld(48, inptr); -#endif -#endif - -#if RGB_PIXELSIZE == 3 - /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 - * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga - * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf - * - * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 - * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 - * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb - * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf - */ - rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0); - rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1); - rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2); - rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3); -#else - /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 - * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 - * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb - * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf - * - * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 - * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 - * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb - * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf - */ - rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX); - rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX); - rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX); - rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX); -#endif - - /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3 - * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 - * ... - * - * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't - * support unsigned vectors. 
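The RGBG_INDEX tables above are operands for vec_perm, whose byte-selection behavior can be modeled in scalar C as follows (illustrative model: indices 0-15 select from the first source vector, 16-31 from the second):

    static void perm16(const unsigned char a[16], const unsigned char b[16],
                       const unsigned char idx[16], unsigned char out[16])
    {
      int i;
      for (i = 0; i < 16; i++)
        out[i] = (idx[i] < 16) ? a[idx[i]] : b[idx[i] - 16];
    }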
- */ - rg0 = (__vector signed short)VEC_UNPACKHU(rgbg0); - bg0 = (__vector signed short)VEC_UNPACKLU(rgbg0); - rg1 = (__vector signed short)VEC_UNPACKHU(rgbg1); - bg1 = (__vector signed short)VEC_UNPACKLU(rgbg1); - rg2 = (__vector signed short)VEC_UNPACKHU(rgbg2); - bg2 = (__vector signed short)VEC_UNPACKLU(rgbg2); - rg3 = (__vector signed short)VEC_UNPACKHU(rgbg3); - bg3 = (__vector signed short)VEC_UNPACKLU(rgbg3); - - /* (Original) - * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - * - * (This implementation) - * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - */ - - /* Calculate Y values */ - - y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf); - y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf); - y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf); - y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf); - y0 = vec_msums(bg0, pw_f0114_f0250, y0); - y1 = vec_msums(bg1, pw_f0114_f0250, y1); - y2 = vec_msums(bg2, pw_f0114_f0250, y2); - y3 = vec_msums(bg3, pw_f0114_f0250, y3); - /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from - * each dword into a new 16-bit vector, which is the equivalent of - * descaling the 32-bit results (right-shifting by 16 bits) and then - * packing them. - */ - yl = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1, - shift_pack_index); - yh = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3, - shift_pack_index); - y = vec_pack(yl, yh); - vec_st(y, 0, outptr); - } - } -} diff --git a/simd/powerpc/jcsample-altivec.c b/simd/powerpc/jcsample-altivec.c deleted file mode 100644 index 6e25b8d..0000000 --- a/simd/powerpc/jcsample-altivec.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
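The shift_pack_index trick noted above is, per lane, just a take-the-high-half: the big-endian table selects bytes {0,1} of each 32-bit accumulator, the little-endian table bytes {2,3}, and either selection equals a 16-bit right shift followed by narrowing. Scalar equivalent (illustrative):

    static unsigned short descale16(int acc)
    {
      return (unsigned short)(acc >> 16);   /* >> SCALEBITS, then narrow */
    }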
- */ - -/* CHROMA DOWNSAMPLING */ - -#include "jsimd_altivec.h" -#include "jcsample.h" - - -void jsimd_h2v1_downsample_altivec(JDIMENSION image_width, - int max_v_samp_factor, - JDIMENSION v_samp_factor, - JDIMENSION width_in_blocks, - JSAMPARRAY input_data, - JSAMPARRAY output_data) -{ - int outrow, outcol; - JDIMENSION output_cols = width_in_blocks * DCTSIZE; - JSAMPROW inptr, outptr; - - __vector unsigned char this0, next0, out; - __vector unsigned short this0e, this0o, next0e, next0o, outl, outh; - - /* Constants */ - __vector unsigned short pw_bias = { __4X2(0, 1) }, - pw_one = { __8X(1) }; - __vector unsigned char even_odd_index = - { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, - pb_zero = { __16X(0) }; - - expand_right_edge(input_data, max_v_samp_factor, image_width, - output_cols * 2); - - for (outrow = 0; outrow < v_samp_factor; outrow++) { - outptr = output_data[outrow]; - inptr = input_data[outrow]; - - for (outcol = output_cols; outcol > 0; - outcol -= 16, inptr += 32, outptr += 16) { - - this0 = vec_ld(0, inptr); - this0 = vec_perm(this0, this0, even_odd_index); - this0e = (__vector unsigned short)VEC_UNPACKHU(this0); - this0o = (__vector unsigned short)VEC_UNPACKLU(this0); - outl = vec_add(this0e, this0o); - outl = vec_add(outl, pw_bias); - outl = vec_sr(outl, pw_one); - - if (outcol > 8) { - next0 = vec_ld(16, inptr); - next0 = vec_perm(next0, next0, even_odd_index); - next0e = (__vector unsigned short)VEC_UNPACKHU(next0); - next0o = (__vector unsigned short)VEC_UNPACKLU(next0); - outh = vec_add(next0e, next0o); - outh = vec_add(outh, pw_bias); - outh = vec_sr(outh, pw_one); - } else - outh = vec_splat_u16(0); - - out = vec_pack(outl, outh); - vec_st(out, 0, outptr); - } - } -} - - -void -jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor, - JDIMENSION v_samp_factor, - JDIMENSION width_in_blocks, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - int inrow, outrow, outcol; - JDIMENSION output_cols = width_in_blocks * DCTSIZE; - JSAMPROW inptr0, inptr1, outptr; - - __vector unsigned char this0, next0, this1, next1, out; - __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o, - next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; - - /* Constants */ - __vector unsigned short pw_bias = { __4X2(1, 2) }, - pw_two = { __8X(2) }; - __vector unsigned char even_odd_index = - { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, - pb_zero = { __16X(0) }; - - expand_right_edge(input_data, max_v_samp_factor, image_width, - output_cols * 2); - - for (inrow = 0, outrow = 0; outrow < v_samp_factor; - inrow += 2, outrow++) { - - inptr0 = input_data[inrow]; - inptr1 = input_data[inrow + 1]; - outptr = output_data[outrow]; - - for (outcol = output_cols; outcol > 0; - outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) { - - this0 = vec_ld(0, inptr0); - this0 = vec_perm(this0, this0, even_odd_index); - this0e = (__vector unsigned short)VEC_UNPACKHU(this0); - this0o = (__vector unsigned short)VEC_UNPACKLU(this0); - out0l = vec_add(this0e, this0o); - - this1 = vec_ld(0, inptr1); - this1 = vec_perm(this1, this1, even_odd_index); - this1e = (__vector unsigned short)VEC_UNPACKHU(this1); - this1o = (__vector unsigned short)VEC_UNPACKLU(this1); - out1l = vec_add(this1e, this1o); - - outl = vec_add(out0l, out1l); - outl = vec_add(outl, pw_bias); - outl = vec_sr(outl, pw_two); - - if (outcol > 8) { - next0 = vec_ld(16, inptr0); - next0 = vec_perm(next0, next0, even_odd_index); - next0e = (__vector unsigned short)VEC_UNPACKHU(next0); - 
next0o = (__vector unsigned short)VEC_UNPACKLU(next0); - out0h = vec_add(next0e, next0o); - - next1 = vec_ld(16, inptr1); - next1 = vec_perm(next1, next1, even_odd_index); - next1e = (__vector unsigned short)VEC_UNPACKHU(next1); - next1o = (__vector unsigned short)VEC_UNPACKLU(next1); - out1h = vec_add(next1e, next1o); - - outh = vec_add(out0h, out1h); - outh = vec_add(outh, pw_bias); - outh = vec_sr(outh, pw_two); - } else - outh = vec_splat_u16(0); - - out = vec_pack(outl, outh); - vec_st(out, 0, outptr); - } - } -} diff --git a/simd/powerpc/jcsample.h b/simd/powerpc/jcsample.h deleted file mode 100644 index 2ac4816..0000000 --- a/simd/powerpc/jcsample.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * jcsample.h - * - * This file was part of the Independent JPEG Group's software: - * Copyright (C) 1991-1996, Thomas G. Lane. - * For conditions of distribution and use, see the accompanying README.ijg - * file. - */ - -LOCAL(void) -expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, - JDIMENSION output_cols) -{ - register JSAMPROW ptr; - register JSAMPLE pixval; - register int count; - int row; - int numcols = (int)(output_cols - input_cols); - - if (numcols > 0) { - for (row = 0; row < num_rows; row++) { - ptr = image_data[row] + input_cols; - pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ - for (count = numcols; count > 0; count--) - *ptr++ = pixval; - } - } -} diff --git a/simd/powerpc/jdcolext-altivec.c b/simd/powerpc/jdcolext-altivec.c deleted file mode 100644 index 68d52bd..0000000 --- a/simd/powerpc/jdcolext-altivec.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
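The downsampling kernels above are plain neighbor averages with an alternating rounding bias (0,1 per output column for h2v1; 1,2 for h2v2, as the pw_bias vectors encode), which keeps rounding from drifting in one direction. A scalar sketch of the h2v1 case (illustrative only):

    static void h2v1_downsample_row(const unsigned char *in,
                                    unsigned char *out, int out_cols)
    {
      int i;
      for (i = 0; i < out_cols; i++)
        out[i] = (unsigned char)((in[2 * i] + in[2 * i + 1] + (i & 1)) >> 1);
    }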
- */ - -/* This file is included by jdcolor-altivec.c */ - - -void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ - JSAMPROW outptr, inptr0, inptr1, inptr2; - int pitch = out_width * RGB_PIXELSIZE, num_cols; -#if __BIG_ENDIAN__ - int offset; -#endif - unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; - - __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, - y, cb, cr; -#if __BIG_ENDIAN__ - __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; -#if RGB_PIXELSIZE == 4 - __vector unsigned char out4; -#endif -#endif -#if RGB_PIXELSIZE == 4 - __vector unsigned char rgb3; -#endif - __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, yl, yh, cbl, cbh, - crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; - __vector int g0, g1, g2, g3; - - /* Constants - * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 - * high-order bits, not 16. - */ - __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, - pw_mf0228 = { __8X(-F_0_228 >> 1) }, - pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, - pw_one = { __8X(1) }, pw_255 = { __8X(255) }, - pw_cj = { __8X(CENTERJSAMPLE) }; - __vector int pd_onehalf = { __4X(ONE_HALF) }; - __vector unsigned char pb_zero = { __16X(0) }, -#if __BIG_ENDIAN__ - shift_pack_index = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; -#else - shift_pack_index = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; -#endif - - while (--num_rows >= 0) { - inptr0 = input_buf[0][input_row]; - inptr1 = input_buf[1][input_row]; - inptr2 = input_buf[2][input_row]; - input_row++; - outptr = *output_buf++; - - for (num_cols = pitch; num_cols > 0; - num_cols -= RGB_PIXELSIZE * 16, outptr += RGB_PIXELSIZE * 16, - inptr0 += 16, inptr1 += 16, inptr2 += 16) { - - y = vec_ld(0, inptr0); - /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't - * support unsigned vectors. 
- */ - yl = (__vector signed short)VEC_UNPACKHU(y); - yh = (__vector signed short)VEC_UNPACKLU(y); - - cb = vec_ld(0, inptr1); - cbl = (__vector signed short)VEC_UNPACKHU(cb); - cbh = (__vector signed short)VEC_UNPACKLU(cb); - cbl = vec_sub(cbl, pw_cj); - cbh = vec_sub(cbh, pw_cj); - - cr = vec_ld(0, inptr2); - crl = (__vector signed short)VEC_UNPACKHU(cr); - crh = (__vector signed short)VEC_UNPACKLU(cr); - crl = vec_sub(crl, pw_cj); - crh = vec_sub(crh, pw_cj); - - /* (Original) - * R = Y + 1.40200 * Cr - * G = Y - 0.34414 * Cb - 0.71414 * Cr - * B = Y + 1.77200 * Cb - * - * (This implementation) - * R = Y + 0.40200 * Cr + Cr - * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - * B = Y - 0.22800 * Cb + Cb + Cb - */ - bl = vec_add(cbl, cbl); - bh = vec_add(cbh, cbh); - bl = vec_madds(bl, pw_mf0228, pw_one); - bh = vec_madds(bh, pw_mf0228, pw_one); - bl = vec_sra(bl, (__vector unsigned short)pw_one); - bh = vec_sra(bh, (__vector unsigned short)pw_one); - bl = vec_add(bl, cbl); - bh = vec_add(bh, cbh); - bl = vec_add(bl, cbl); - bh = vec_add(bh, cbh); - bl = vec_add(bl, yl); - bh = vec_add(bh, yh); - - rl = vec_add(crl, crl); - rh = vec_add(crh, crh); - rl = vec_madds(rl, pw_f0402, pw_one); - rh = vec_madds(rh, pw_f0402, pw_one); - rl = vec_sra(rl, (__vector unsigned short)pw_one); - rh = vec_sra(rh, (__vector unsigned short)pw_one); - rl = vec_add(rl, crl); - rh = vec_add(rh, crh); - rl = vec_add(rl, yl); - rh = vec_add(rh, yh); - - g0w = vec_mergeh(cbl, crl); - g1w = vec_mergel(cbl, crl); - g0 = vec_msums(g0w, pw_mf0344_f0285, pd_onehalf); - g1 = vec_msums(g1w, pw_mf0344_f0285, pd_onehalf); - g2w = vec_mergeh(cbh, crh); - g3w = vec_mergel(cbh, crh); - g2 = vec_msums(g2w, pw_mf0344_f0285, pd_onehalf); - g3 = vec_msums(g3w, pw_mf0344_f0285, pd_onehalf); - /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from - * each dword into a new 16-bit vector, which is the equivalent of - * descaling the 32-bit results (right-shifting by 16 bits) and then - * packing them. 
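The rearranged equations above exist because multipliers of 1.0 or more do not fit a signed 16-bit fixed-point constant, so the integer part is peeled off into plain additions. A scalar sketch of the same rearrangement, with the constants reproduced from jdcolor-altivec.c (rounding and clamping omitted; illustrative only):

    #define F_0_344 22554                /* FIX(0.34414)          */
    #define F_0_714 46802                /* FIX(0.71414)          */
    #define F_1_402 91881                /* FIX(1.40200)          */
    #define F_1_772 116130               /* FIX(1.77200)          */
    #define F_0_402 (F_1_402 - 65536)    /* FIX(1.40200) - FIX(1) */
    #define F_0_285 (65536 - F_0_714)    /* FIX(1) - FIX(0.71414) */
    #define F_0_228 (131072 - F_1_772)   /* FIX(2) - FIX(1.77200) */

    static void ycc_to_rgb(int y, int cb, int cr, int *r, int *g, int *b)
    {
      cb -= 128;  cr -= 128;                                /* CENTERJSAMPLE */
      *r = y + (( F_0_402 * cr) >> 16) + cr;                /*  1.402 Cr     */
      *g = y + ((-F_0_344 * cb + F_0_285 * cr) >> 16) - cr; /* -.344 Cb -.714 Cr */
      *b = y + ((-F_0_228 * cb) >> 16) + cb + cb;           /*  1.772 Cb     */
    }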
- */ - gl = vec_perm((__vector short)g0, (__vector short)g1, shift_pack_index); - gh = vec_perm((__vector short)g2, (__vector short)g3, shift_pack_index); - gl = vec_sub(gl, crl); - gh = vec_sub(gh, crh); - gl = vec_add(gl, yl); - gh = vec_add(gh, yh); - - rg0 = vec_mergeh(rl, gl); - bx0 = vec_mergeh(bl, pw_255); - rg1 = vec_mergel(rl, gl); - bx1 = vec_mergel(bl, pw_255); - rg2 = vec_mergeh(rh, gh); - bx2 = vec_mergeh(bh, pw_255); - rg3 = vec_mergel(rh, gh); - bx3 = vec_mergel(bh, pw_255); - - rgbx0 = vec_packsu(rg0, bx0); - rgbx1 = vec_packsu(rg1, bx1); - rgbx2 = vec_packsu(rg2, bx2); - rgbx3 = vec_packsu(rg3, bx3); - -#if RGB_PIXELSIZE == 3 - /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 - * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 - * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb - * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf - * - * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 - * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga - * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf - */ - rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); - rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); - rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); -#else - /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 - * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 - * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb - * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf - * - * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 - * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 - * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb - * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf - */ - rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); - rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); - rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); - rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); -#endif - -#if __BIG_ENDIAN__ - offset = (size_t)outptr & 15; - if (offset) { - __vector unsigned char unaligned_shift_index; - int bytes = num_cols + offset; - - if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { - /* Slow path to prevent buffer overwrite. Since there is no way to - * write a partial AltiVec register, overwrite would occur on the - * last chunk of the last image row if the right edge is not on a - * 16-byte boundary. It could also occur on other rows if the bytes - * per row is low enough. Since we can't determine whether we're on - * the last image row, we have to assume every row is the last. 
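The output side mirrors the guarded-load pattern: a partial final chunk is assembled in a local buffer and trimmed with memcpy, because a vector store can only write a full register. A scalar model of that guard (hypothetical names):

    #include <string.h>

    /* vec_st writes exactly 16 bytes, so a partial final chunk is staged
     * in tmpbuf and only the valid prefix is copied to the row. */
    static void emit_chunk(unsigned char *out, size_t remaining,
                           const unsigned char chunk[16])
    {
      unsigned char tmpbuf[16];
      if (remaining >= 16) {
        memcpy(out, chunk, 16);           /* fast path: full-width store */
      } else {
        memcpy(tmpbuf, chunk, 16);        /* stage the register...       */
        memcpy(out, tmpbuf, remaining);   /* ...never past the row end   */
      }
    }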
- */ - vec_st(rgb0, 0, tmpbuf); - vec_st(rgb1, 16, tmpbuf); - vec_st(rgb2, 32, tmpbuf); -#if RGB_PIXELSIZE == 4 - vec_st(rgb3, 48, tmpbuf); -#endif - memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); - } else { - /* Fast path */ - unaligned_shift_index = vec_lvsl(0, outptr); - edgel = vec_ld(0, outptr); - edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); - edges = vec_perm(edgeh, edgel, unaligned_shift_index); - unaligned_shift_index = vec_lvsr(0, outptr); - out0 = vec_perm(edges, rgb0, unaligned_shift_index); - out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); - out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); -#if RGB_PIXELSIZE == 4 - out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); - out4 = vec_perm(rgb3, edges, unaligned_shift_index); -#else - out3 = vec_perm(rgb2, edges, unaligned_shift_index); -#endif - vec_st(out0, 0, outptr); - if (bytes > 16) - vec_st(out1, 16, outptr); - if (bytes > 32) - vec_st(out2, 32, outptr); - if (bytes > 48) - vec_st(out3, 48, outptr); -#if RGB_PIXELSIZE == 4 - if (bytes > 64) - vec_st(out4, 64, outptr); -#endif - } - } else { -#endif /* __BIG_ENDIAN__ */ - if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { - /* Slow path */ - VEC_ST(rgb0, 0, tmpbuf); - VEC_ST(rgb1, 16, tmpbuf); - VEC_ST(rgb2, 32, tmpbuf); -#if RGB_PIXELSIZE == 4 - VEC_ST(rgb3, 48, tmpbuf); -#endif - memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); - } else { - /* Fast path */ - VEC_ST(rgb0, 0, outptr); - if (num_cols > 16) - VEC_ST(rgb1, 16, outptr); - if (num_cols > 32) - VEC_ST(rgb2, 32, outptr); -#if RGB_PIXELSIZE == 4 - if (num_cols > 48) - VEC_ST(rgb3, 48, outptr); -#endif - } -#if __BIG_ENDIAN__ - } -#endif - } - } -} diff --git a/simd/powerpc/jdcolor-altivec.c b/simd/powerpc/jdcolor-altivec.c deleted file mode 100644 index eb35b67..0000000 --- a/simd/powerpc/jdcolor-altivec.c +++ /dev/null @@ -1,106 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
- */ - -/* YCC --> RGB CONVERSION */ - -#include "jsimd_altivec.h" - - -#define F_0_344 22554 /* FIX(0.34414) */ -#define F_0_714 46802 /* FIX(0.71414) */ -#define F_1_402 91881 /* FIX(1.40200) */ -#define F_1_772 116130 /* FIX(1.77200) */ -#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ -#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ -#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ - -#define SCALEBITS 16 -#define ONE_HALF (1 << (SCALEBITS - 1)) - -#define RGB_INDEX0 \ - { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } -#define RGB_INDEX1 \ - { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } -#define RGB_INDEX2 \ - { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE - -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX0 -#undef RGB_INDEX1 -#undef RGB_INDEX2 -#undef jsimd_ycc_rgb_convert_altivec - -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define RGB_INDEX \ - { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_ycc_rgb_convert_altivec - -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define RGB_INDEX0 \ - { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } -#define RGB_INDEX1 \ - { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } -#define RGB_INDEX2 \ - { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX0 -#undef RGB_INDEX1 -#undef RGB_INDEX2 -#undef jsimd_ycc_rgb_convert_altivec - -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define RGB_INDEX \ - { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_ycc_rgb_convert_altivec - -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define RGB_INDEX \ - { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_ycc_rgb_convert_altivec - -#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define RGB_INDEX \ - { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } -#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec -#include "jdcolext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_ycc_rgb_convert_altivec diff --git a/simd/powerpc/jdmerge-altivec.c b/simd/powerpc/jdmerge-altivec.c deleted file mode 100644 index 79c577f..0000000 --- a/simd/powerpc/jdmerge-altivec.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. 
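The repeated #define/#include/#undef blocks above are a template-by-preprocessor pattern: one conversion body (jdcolext-altivec.c) is re-included once per output pixel layout, each time under a different RGB_PIXELSIZE, permutation table, and function name. A self-contained miniature of the idea (file and names are hypothetical):

    /* body.inc -- the shared body, parameterized by SCALE and SCALE_FN */
    static int SCALE_FN(int x) { return x * SCALE; }

    /* consumer translation unit */
    #define SCALE    2
    #define SCALE_FN scale_by_2
    #include "body.inc"          /* emits scale_by_2() */
    #undef SCALE
    #undef SCALE_FN

    #define SCALE    3
    #define SCALE_FN scale_by_3
    #include "body.inc"          /* emits scale_by_3() */
    #undef SCALE
    #undef SCALE_FN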
The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */ - -#include "jsimd_altivec.h" - - -#define F_0_344 22554 /* FIX(0.34414) */ -#define F_0_714 46802 /* FIX(0.71414) */ -#define F_1_402 91881 /* FIX(1.40200) */ -#define F_1_772 116130 /* FIX(1.77200) */ -#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ -#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ -#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ - -#define SCALEBITS 16 -#define ONE_HALF (1 << (SCALEBITS - 1)) - -#define RGB_INDEX0 \ - { 0, 1, 8, 2, 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18 } -#define RGB_INDEX1 \ - { 3, 10, 4, 5, 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21 } -#define RGB_INDEX2 \ - { 12, 6, 7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 } -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE - -#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extrgb_merged_upsample_altivec -#define jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extrgb_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX0 -#undef RGB_INDEX1 -#undef RGB_INDEX2 -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec - -#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE -#define RGB_INDEX \ - { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 } -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extrgbx_merged_upsample_altivec -#define jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extrgbx_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec - -#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE -#define RGB_INDEX0 \ - { 8, 1, 0, 10, 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26 } -#define RGB_INDEX1 \ - { 3, 2, 12, 5, 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21 } -#define RGB_INDEX2 \ - { 4, 14, 7, 6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 } -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extbgr_merged_upsample_altivec -#define jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extbgr_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX0 -#undef RGB_INDEX1 -#undef RGB_INDEX2 -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec - -#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE -#define RGB_INDEX \ - { 8, 1, 0, 9, 10, 3, 2, 11, 12, 5, 4, 13, 14, 7, 6, 15 } -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extbgrx_merged_upsample_altivec -#define jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extbgrx_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec - -#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE -#define RGB_INDEX \ - { 9, 8, 1, 0, 11, 10, 3, 2, 13, 12, 5, 4, 15, 14, 7, 6 } -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extxbgr_merged_upsample_altivec -#define 
jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extxbgr_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec - -#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE -#define RGB_INDEX \ - { 9, 0, 1, 8, 11, 2, 3, 10, 13, 4, 5, 12, 15, 6, 7, 14 } -#define jsimd_h2v1_merged_upsample_altivec \ - jsimd_h2v1_extxrgb_merged_upsample_altivec -#define jsimd_h2v2_merged_upsample_altivec \ - jsimd_h2v2_extxrgb_merged_upsample_altivec -#include "jdmrgext-altivec.c" -#undef RGB_PIXELSIZE -#undef RGB_INDEX -#undef jsimd_h2v1_merged_upsample_altivec -#undef jsimd_h2v2_merged_upsample_altivec diff --git a/simd/powerpc/jdmrgext-altivec.c b/simd/powerpc/jdmrgext-altivec.c deleted file mode 100644 index 40f02c3..0000000 --- a/simd/powerpc/jdmrgext-altivec.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* This file is included by jdmerge-altivec.c */ - - -void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width, - JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) -{ - JSAMPROW outptr, inptr0, inptr1, inptr2; - int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop; -#if __BIG_ENDIAN__ - int offset; -#endif - unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16]; - - __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3, - y, cb, cr; -#if __BIG_ENDIAN__ - __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3; -#if RGB_PIXELSIZE == 4 - __vector unsigned char out4; -#endif -#endif -#if RGB_PIXELSIZE == 4 - __vector unsigned char rgb3; -#endif - __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh, - crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, - rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo; - __vector int g_y0, g_y1, g_y2, g_y3; - - /* Constants - * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 - * high-order bits, not 16. 
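The >> 1 compensation noted above follows from the semantics of vec_madds: it adds the third operand to the 17 high-order bits of the 32-bit product, i.e. a 15-bit rather than 16-bit right shift. A scalar model (saturation omitted; illustrative):

    static short madds(short a, short b, short c)
    {
      return (short)((((int)a * b) >> 15) + c);
    }
    /* So a constant meant as x * (K / 65536) is stored as K >> 1:
     *   madds(x, K >> 1, 0) == (x * K) >> 16   (up to rounding of K). */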
- */ - __vector short pw_f0402 = { __8X(F_0_402 >> 1) }, - pw_mf0228 = { __8X(-F_0_228 >> 1) }, - pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) }, - pw_one = { __8X(1) }, pw_255 = { __8X(255) }, - pw_cj = { __8X(CENTERJSAMPLE) }; - __vector int pd_onehalf = { __4X(ONE_HALF) }; - __vector unsigned char pb_zero = { __16X(0) }, -#if __BIG_ENDIAN__ - shift_pack_index = - { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, - even_index = - { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 }, - odd_index = - { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 }; -#else - shift_pack_index = - { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, - even_index = - { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 }, - odd_index = - { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 }; -#endif - - inptr0 = input_buf[0][in_row_group_ctr]; - inptr1 = input_buf[1][in_row_group_ctr]; - inptr2 = input_buf[2][in_row_group_ctr]; - outptr = output_buf[0]; - - for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) { - - cb = vec_ld(0, inptr1); - /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't - * support unsigned vectors. - */ - cbl = (__vector signed short)VEC_UNPACKHU(cb); - cbh = (__vector signed short)VEC_UNPACKLU(cb); - cbl = vec_sub(cbl, pw_cj); - cbh = vec_sub(cbh, pw_cj); - - cr = vec_ld(0, inptr2); - crl = (__vector signed short)VEC_UNPACKHU(cr); - crh = (__vector signed short)VEC_UNPACKLU(cr); - crl = vec_sub(crl, pw_cj); - crh = vec_sub(crh, pw_cj); - - /* (Original) - * R = Y + 1.40200 * Cr - * G = Y - 0.34414 * Cb - 0.71414 * Cr - * B = Y + 1.77200 * Cb - * - * (This implementation) - * R = Y + 0.40200 * Cr + Cr - * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - * B = Y - 0.22800 * Cb + Cb + Cb - */ - b_yl = vec_add(cbl, cbl); - b_yh = vec_add(cbh, cbh); - b_yl = vec_madds(b_yl, pw_mf0228, pw_one); - b_yh = vec_madds(b_yh, pw_mf0228, pw_one); - b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one); - b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one); - b_yl = vec_add(b_yl, cbl); - b_yh = vec_add(b_yh, cbh); - b_yl = vec_add(b_yl, cbl); - b_yh = vec_add(b_yh, cbh); - - r_yl = vec_add(crl, crl); - r_yh = vec_add(crh, crh); - r_yl = vec_madds(r_yl, pw_f0402, pw_one); - r_yh = vec_madds(r_yh, pw_f0402, pw_one); - r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one); - r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one); - r_yl = vec_add(r_yl, crl); - r_yh = vec_add(r_yh, crh); - - g_y0w = vec_mergeh(cbl, crl); - g_y1w = vec_mergel(cbl, crl); - g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf); - g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf); - g_y2w = vec_mergeh(cbh, crh); - g_y3w = vec_mergel(cbh, crh); - g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf); - g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf); - /* Clever way to avoid 4 shifts + 2 packs. This packs the high word from - * each dword into a new 16-bit vector, which is the equivalent of - * descaling the 32-bit results (right-shifting by 16 bits) and then - * packing them. 
- */ - g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1, - shift_pack_index); - g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3, - shift_pack_index); - g_yl = vec_sub(g_yl, crl); - g_yh = vec_sub(g_yh, crh); - - for (yloop = 0; yloop < 2 && num_cols > 0; yloop++, - num_cols -= RGB_PIXELSIZE * 16, - outptr += RGB_PIXELSIZE * 16, inptr0 += 16) { - - y = vec_ld(0, inptr0); - ye = (__vector signed short)vec_perm(pb_zero, y, even_index); - yo = (__vector signed short)vec_perm(pb_zero, y, odd_index); - - if (yloop == 0) { - be = vec_add(b_yl, ye); - bo = vec_add(b_yl, yo); - re = vec_add(r_yl, ye); - ro = vec_add(r_yl, yo); - ge = vec_add(g_yl, ye); - go = vec_add(g_yl, yo); - } else { - be = vec_add(b_yh, ye); - bo = vec_add(b_yh, yo); - re = vec_add(r_yh, ye); - ro = vec_add(r_yh, yo); - ge = vec_add(g_yh, ye); - go = vec_add(g_yh, yo); - } - - rl = vec_mergeh(re, ro); - rh = vec_mergel(re, ro); - gl = vec_mergeh(ge, go); - gh = vec_mergel(ge, go); - bl = vec_mergeh(be, bo); - bh = vec_mergel(be, bo); - - rg0 = vec_mergeh(rl, gl); - bx0 = vec_mergeh(bl, pw_255); - rg1 = vec_mergel(rl, gl); - bx1 = vec_mergel(bl, pw_255); - rg2 = vec_mergeh(rh, gh); - bx2 = vec_mergeh(bh, pw_255); - rg3 = vec_mergel(rh, gh); - bx3 = vec_mergel(bh, pw_255); - - rgbx0 = vec_packsu(rg0, bx0); - rgbx1 = vec_packsu(rg1, bx1); - rgbx2 = vec_packsu(rg2, bx2); - rgbx3 = vec_packsu(rg3, bx3); - -#if RGB_PIXELSIZE == 3 - /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 - * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 - * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb - * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf - * - * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 - * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga - * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf - */ - rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0); - rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1); - rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2); -#else - /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3 - * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7 - * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb - * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf - * - * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3 - * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7 - * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb - * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf - */ - rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX); - rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX); - rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX); - rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX); -#endif - -#if __BIG_ENDIAN__ - offset = (size_t)outptr & 15; - if (offset) { - __vector unsigned char unaligned_shift_index; - int bytes = num_cols + offset; - - if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) { - /* Slow path to prevent buffer overwrite. Since there is no way to - * write a partial AltiVec register, overwrite would occur on the - * last chunk of the last image row if the right edge is not on a - * 16-byte boundary. It could also occur on other rows if the bytes - * per row is low enough. Since we can't determine whether we're on - * the last image row, we have to assume every row is the last. 
- */ - vec_st(rgb0, 0, tmpbuf); - vec_st(rgb1, 16, tmpbuf); - vec_st(rgb2, 32, tmpbuf); -#if RGB_PIXELSIZE == 4 - vec_st(rgb3, 48, tmpbuf); -#endif - memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); - } else { - /* Fast path */ - unaligned_shift_index = vec_lvsl(0, outptr); - edgel = vec_ld(0, outptr); - edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr); - edges = vec_perm(edgeh, edgel, unaligned_shift_index); - unaligned_shift_index = vec_lvsr(0, outptr); - out0 = vec_perm(edges, rgb0, unaligned_shift_index); - out1 = vec_perm(rgb0, rgb1, unaligned_shift_index); - out2 = vec_perm(rgb1, rgb2, unaligned_shift_index); -#if RGB_PIXELSIZE == 4 - out3 = vec_perm(rgb2, rgb3, unaligned_shift_index); - out4 = vec_perm(rgb3, edges, unaligned_shift_index); -#else - out3 = vec_perm(rgb2, edges, unaligned_shift_index); -#endif - vec_st(out0, 0, outptr); - if (bytes > 16) - vec_st(out1, 16, outptr); - if (bytes > 32) - vec_st(out2, 32, outptr); - if (bytes > 48) - vec_st(out3, 48, outptr); -#if RGB_PIXELSIZE == 4 - if (bytes > 64) - vec_st(out4, 64, outptr); -#endif - } - } else { -#endif /* __BIG_ENDIAN__ */ - if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) { - /* Slow path */ - VEC_ST(rgb0, 0, tmpbuf); - VEC_ST(rgb1, 16, tmpbuf); - VEC_ST(rgb2, 32, tmpbuf); -#if RGB_PIXELSIZE == 4 - VEC_ST(rgb3, 48, tmpbuf); -#endif - memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16)); - } else { - /* Fast path */ - VEC_ST(rgb0, 0, outptr); - if (num_cols > 16) - VEC_ST(rgb1, 16, outptr); - if (num_cols > 32) - VEC_ST(rgb2, 32, outptr); -#if RGB_PIXELSIZE == 4 - if (num_cols > 48) - VEC_ST(rgb3, 48, outptr); -#endif - } -#if __BIG_ENDIAN__ - } -#endif - } - } -} - - -void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width, - JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) -{ - JSAMPROW inptr, outptr; - - inptr = input_buf[0][in_row_group_ctr]; - outptr = output_buf[0]; - - input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2]; - jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, - output_buf); - - input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1]; - output_buf[0] = output_buf[1]; - jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr, - output_buf); - - input_buf[0][in_row_group_ctr] = inptr; - output_buf[0] = outptr; -} diff --git a/simd/powerpc/jdsample-altivec.c b/simd/powerpc/jdsample-altivec.c deleted file mode 100644 index 04df0cf..0000000 --- a/simd/powerpc/jdsample-altivec.c +++ /dev/null @@ -1,400 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. 
This notice may not be removed or altered from any source distribution. - */ - -/* CHROMA UPSAMPLING */ - -#include "jsimd_altivec.h" - - -void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor, - JDIMENSION downsampled_width, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - JSAMPROW inptr, outptr; - int inrow, incol; - - __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0, - out; - __vector short this0e, this0o, this0l, this0h, last0l, last0h, - next0l, next0h, outle, outhe, outlo, outho; - - /* Constants */ - __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, - last_index_col0 = - { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, - last_index = - { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }, - next_index = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }, - next_index_lastcol = - { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 }, -#if __BIG_ENDIAN__ - merge_pack_index = - { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; -#else - merge_pack_index = - { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; -#endif - __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; - - for (inrow = 0; inrow < max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; - - if (downsampled_width & 15) - inptr[downsampled_width] = inptr[downsampled_width - 1]; - - this0 = vec_ld(0, inptr); - p_last0 = vec_perm(this0, this0, last_index_col0); - last0 = this0; - - for (incol = downsampled_width; incol > 0; - incol -= 16, inptr += 16, outptr += 32) { - - if (downsampled_width - incol > 0) { - p_last0 = vec_perm(last0, this0, last_index); - last0 = this0; - } - - if (incol <= 16) - p_next0 = vec_perm(this0, this0, next_index_lastcol); - else { - next0 = vec_ld(16, inptr); - p_next0 = vec_perm(this0, next0, next_index); - } - - this0e = (__vector short)vec_mule(this0, pb_three); - this0o = (__vector short)vec_mulo(this0, pb_three); - this0l = vec_mergeh(this0e, this0o); - this0h = vec_mergel(this0e, this0o); - - last0l = (__vector short)VEC_UNPACKHU(p_last0); - last0h = (__vector short)VEC_UNPACKLU(p_last0); - last0l = vec_add(last0l, pw_one); - - next0l = (__vector short)VEC_UNPACKHU(p_next0); - next0h = (__vector short)VEC_UNPACKLU(p_next0); - next0l = vec_add(next0l, pw_two); - - outle = vec_add(this0l, last0l); - outlo = vec_add(this0l, next0l); - outle = vec_sr(outle, (__vector unsigned short)pw_two); - outlo = vec_sr(outlo, (__vector unsigned short)pw_two); - - out = vec_perm((__vector unsigned char)outle, - (__vector unsigned char)outlo, merge_pack_index); - vec_st(out, 0, outptr); - - if (incol > 8) { - last0h = vec_add(last0h, pw_one); - next0h = vec_add(next0h, pw_two); - - outhe = vec_add(this0h, last0h); - outho = vec_add(this0h, next0h); - outhe = vec_sr(outhe, (__vector unsigned short)pw_two); - outho = vec_sr(outho, (__vector unsigned short)pw_two); - - out = vec_perm((__vector unsigned char)outhe, - (__vector unsigned char)outho, merge_pack_index); - vec_st(out, 16, outptr); - } - - this0 = next0; - } - } -} - - -void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor, - JDIMENSION downsampled_width, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; - int inrow, outrow, incol; - - __vector unsigned char this_1, this0, this1, out; - __vector short this_1l, this_1h, this0l, this0h, 
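In scalar terms, the h2v1 "fancy" (triangle-filter) upsample above expands each input pixel into two outputs weighted 3:1 with its nearer neighbor; the +1/+2 biases round without drifting in one direction, and edge pixels are replicated, which makes (4x+1)>>2 and (4x+2)>>2 collapse back to x at the borders. An illustrative sketch:

    static void h2v1_fancy_row(const unsigned char *in, unsigned char *out,
                               int w)
    {
      int i;
      for (i = 0; i < w; i++) {
        int last = in[i > 0 ? i - 1 : 0];          /* replicate left edge  */
        int next = in[i < w - 1 ? i + 1 : w - 1];  /* replicate right edge */
        out[2 * i]     = (unsigned char)((3 * in[i] + last + 1) >> 2);
        out[2 * i + 1] = (unsigned char)((3 * in[i] + next + 2) >> 2);
      }
    }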
this1l, this1h, - lastcolsum_1h, lastcolsum1h, - p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, - thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, - nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 }, - nextcolsum1l = { 0 }, nextcolsum1h = { 0 }, - p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, - tmpl, tmph, outle, outhe, outlo, outho; - - /* Constants */ - __vector unsigned char pb_zero = { __16X(0) }, - last_index_col0 = - { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, - last_index = - { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, - next_index = - { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }, - next_index_lastcol = - { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 }, -#if __BIG_ENDIAN__ - merge_pack_index = - { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; -#else - merge_pack_index = - { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; -#endif - __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, - pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; - __vector unsigned short pw_four = { __8X(4) }; - - for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { - - inptr_1 = input_data[inrow - 1]; - inptr0 = input_data[inrow]; - inptr1 = input_data[inrow + 1]; - outptr0 = output_data[outrow++]; - outptr1 = output_data[outrow++]; - - if (downsampled_width & 15) { - inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; - inptr0[downsampled_width] = inptr0[downsampled_width - 1]; - inptr1[downsampled_width] = inptr1[downsampled_width - 1]; - } - - this0 = vec_ld(0, inptr0); - this0l = (__vector short)VEC_UNPACKHU(this0); - this0h = (__vector short)VEC_UNPACKLU(this0); - this0l = vec_mladd(this0l, pw_three, pw_zero); - this0h = vec_mladd(this0h, pw_three, pw_zero); - - this_1 = vec_ld(0, inptr_1); - this_1l = (__vector short)VEC_UNPACKHU(this_1); - this_1h = (__vector short)VEC_UNPACKLU(this_1); - thiscolsum_1l = vec_add(this0l, this_1l); - thiscolsum_1h = vec_add(this0h, this_1h); - lastcolsum_1h = thiscolsum_1h; - p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); - p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); - - this1 = vec_ld(0, inptr1); - this1l = (__vector short)VEC_UNPACKHU(this1); - this1h = (__vector short)VEC_UNPACKLU(this1); - thiscolsum1l = vec_add(this0l, this1l); - thiscolsum1h = vec_add(this0h, this1h); - lastcolsum1h = thiscolsum1h; - p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); - p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); - - for (incol = downsampled_width; incol > 0; - incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, - outptr0 += 32, outptr1 += 32) { - - if (downsampled_width - incol > 0) { - p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); - p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); - p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); - p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); - lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; - } - - if (incol <= 16) { - p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); - p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, - next_index_lastcol); - p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); - p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, - next_index_lastcol); - } else { - this0 = vec_ld(16, inptr0); - this0l = (__vector 
short)VEC_UNPACKHU(this0); - this0h = (__vector short)VEC_UNPACKLU(this0); - this0l = vec_mladd(this0l, pw_three, pw_zero); - this0h = vec_mladd(this0h, pw_three, pw_zero); - - this_1 = vec_ld(16, inptr_1); - this_1l = (__vector short)VEC_UNPACKHU(this_1); - this_1h = (__vector short)VEC_UNPACKLU(this_1); - nextcolsum_1l = vec_add(this0l, this_1l); - nextcolsum_1h = vec_add(this0h, this_1h); - p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); - p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); - - this1 = vec_ld(16, inptr1); - this1l = (__vector short)VEC_UNPACKHU(this1); - this1h = (__vector short)VEC_UNPACKLU(this1); - nextcolsum1l = vec_add(this0l, this1l); - nextcolsum1h = vec_add(this0h, this1h); - p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); - p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); - } - - /* Process the upper row */ - - tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); - outle = vec_add(tmpl, p_lastcolsum_1l); - outle = vec_add(outle, pw_eight); - outle = vec_sr(outle, pw_four); - - outlo = vec_add(tmpl, p_nextcolsum_1l); - outlo = vec_add(outlo, pw_seven); - outlo = vec_sr(outlo, pw_four); - - out = vec_perm((__vector unsigned char)outle, - (__vector unsigned char)outlo, merge_pack_index); - vec_st(out, 0, outptr0); - - if (incol > 8) { - tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); - outhe = vec_add(tmph, p_lastcolsum_1h); - outhe = vec_add(outhe, pw_eight); - outhe = vec_sr(outhe, pw_four); - - outho = vec_add(tmph, p_nextcolsum_1h); - outho = vec_add(outho, pw_seven); - outho = vec_sr(outho, pw_four); - - out = vec_perm((__vector unsigned char)outhe, - (__vector unsigned char)outho, merge_pack_index); - vec_st(out, 16, outptr0); - } - - /* Process the lower row */ - - tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); - outle = vec_add(tmpl, p_lastcolsum1l); - outle = vec_add(outle, pw_eight); - outle = vec_sr(outle, pw_four); - - outlo = vec_add(tmpl, p_nextcolsum1l); - outlo = vec_add(outlo, pw_seven); - outlo = vec_sr(outlo, pw_four); - - out = vec_perm((__vector unsigned char)outle, - (__vector unsigned char)outlo, merge_pack_index); - vec_st(out, 0, outptr1); - - if (incol > 8) { - tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); - outhe = vec_add(tmph, p_lastcolsum1h); - outhe = vec_add(outhe, pw_eight); - outhe = vec_sr(outhe, pw_four); - - outho = vec_add(tmph, p_nextcolsum1h); - outho = vec_add(outho, pw_seven); - outho = vec_sr(outho, pw_four); - - out = vec_perm((__vector unsigned char)outhe, - (__vector unsigned char)outho, merge_pack_index); - vec_st(out, 16, outptr1); - } - - thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; - thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; - } - } -} - - -/* These are rarely used (mainly just for decompressing YCCK images) */ - -void jsimd_h2v1_upsample_altivec(int max_v_samp_factor, - JDIMENSION output_width, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - JSAMPROW inptr, outptr; - int inrow, incol; - - __vector unsigned char in, inl, inh; - - for (inrow = 0; inrow < max_v_samp_factor; inrow++) { - inptr = input_data[inrow]; - outptr = output_data[inrow]; - - for (incol = (output_width + 31) & (~31); incol > 0; - incol -= 64, inptr += 32, outptr += 64) { - - in = vec_ld(0, inptr); - inl = vec_mergeh(in, in); - inh = vec_mergel(in, in); - - vec_st(inl, 0, outptr); - vec_st(inh, 16, outptr); - - if (incol > 32) { - in = vec_ld(16, inptr); - inl = 
vec_mergeh(in, in); - inh = vec_mergel(in, in); - - vec_st(inl, 32, outptr); - vec_st(inh, 48, outptr); - } - } - } -} - - -void jsimd_h2v2_upsample_altivec(int max_v_samp_factor, - JDIMENSION output_width, - JSAMPARRAY input_data, - JSAMPARRAY *output_data_ptr) -{ - JSAMPARRAY output_data = *output_data_ptr; - JSAMPROW inptr, outptr0, outptr1; - int inrow, outrow, incol; - - __vector unsigned char in, inl, inh; - - for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { - - inptr = input_data[inrow]; - outptr0 = output_data[outrow++]; - outptr1 = output_data[outrow++]; - - for (incol = (output_width + 31) & (~31); incol > 0; - incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { - - in = vec_ld(0, inptr); - inl = vec_mergeh(in, in); - inh = vec_mergel(in, in); - - vec_st(inl, 0, outptr0); - vec_st(inl, 0, outptr1); - - vec_st(inh, 16, outptr0); - vec_st(inh, 16, outptr1); - - if (incol > 32) { - in = vec_ld(16, inptr); - inl = vec_mergeh(in, in); - inh = vec_mergel(in, in); - - vec_st(inl, 32, outptr0); - vec_st(inl, 32, outptr1); - - vec_st(inh, 48, outptr0); - vec_st(inh, 48, outptr1); - } - } - } -} diff --git a/simd/powerpc/jfdctfst-altivec.c b/simd/powerpc/jfdctfst-altivec.c deleted file mode 100644 index ad9af81..0000000 --- a/simd/powerpc/jfdctfst-altivec.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* FAST INTEGER FORWARD DCT - * - * This is similar to the SSE2 implementation, except that we left-shift the - * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because - * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: - * the elements in arg3 + the most significant 17 bits of - * (the elements in arg1 * the elements in arg2). 
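Concretely, with CONST_BITS == 8 and PRE_MULTIPLY_SCALE_BITS == 2, CONST_SHIFT works out to 5, and the extra -1 absorbs vec_madds keeping 17 (not 16) high-order product bits. A scalar check using F_0_707 == 181 (illustrative only):

    static short mul_0707(short x)
    {
      short xs = (short)(x << 2);           /* PRE_MULTIPLY_SCALE_BITS      */
      int   c  = 181 << 5;                  /* F_0_707 << CONST_SHIFT       */
      return (short)(((int)xs * c) >> 15);  /* vec_madds' 17-high-bit take  */
    }
    /* xs * c == x * 181 * 128, and >> 15 leaves (x * 181) >> 8,
     * i.e. x * 0.707 in the Q8 arithmetic of this DCT. */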
- */ - -#include "jsimd_altivec.h" - - -#define F_0_382 98 /* FIX(0.382683433) */ -#define F_0_541 139 /* FIX(0.541196100) */ -#define F_0_707 181 /* FIX(0.707106781) */ -#define F_1_306 334 /* FIX(1.306562965) */ - -#define CONST_BITS 8 -#define PRE_MULTIPLY_SCALE_BITS 2 -#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) - - -#define DO_FDCT() { \ - /* Even part */ \ - \ - tmp10 = vec_add(tmp0, tmp3); \ - tmp13 = vec_sub(tmp0, tmp3); \ - tmp11 = vec_add(tmp1, tmp2); \ - tmp12 = vec_sub(tmp1, tmp2); \ - \ - out0 = vec_add(tmp10, tmp11); \ - out4 = vec_sub(tmp10, tmp11); \ - \ - z1 = vec_add(tmp12, tmp13); \ - z1 = vec_sl(z1, pre_multiply_scale_bits); \ - z1 = vec_madds(z1, pw_0707, pw_zero); \ - \ - out2 = vec_add(tmp13, z1); \ - out6 = vec_sub(tmp13, z1); \ - \ - /* Odd part */ \ - \ - tmp10 = vec_add(tmp4, tmp5); \ - tmp11 = vec_add(tmp5, tmp6); \ - tmp12 = vec_add(tmp6, tmp7); \ - \ - tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \ - tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ - z5 = vec_sub(tmp10, tmp12); \ - z5 = vec_madds(z5, pw_0382, pw_zero); \ - \ - z2 = vec_madds(tmp10, pw_0541, z5); \ - z4 = vec_madds(tmp12, pw_1306, z5); \ - \ - tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ - z3 = vec_madds(tmp11, pw_0707, pw_zero); \ - \ - z11 = vec_add(tmp7, z3); \ - z13 = vec_sub(tmp7, z3); \ - \ - out5 = vec_add(z13, z2); \ - out3 = vec_sub(z13, z2); \ - out1 = vec_add(z11, z4); \ - out7 = vec_sub(z11, z4); \ -} - - -void jsimd_fdct_ifast_altivec(DCTELEM *data) -{ - __vector short row0, row1, row2, row3, row4, row5, row6, row7, - col0, col1, col2, col3, col4, col5, col6, col7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, - z1, z2, z3, z4, z5, z11, z13, - out0, out1, out2, out3, out4, out5, out6, out7; - - /* Constants */ - __vector short pw_zero = { __8X(0) }, - pw_0382 = { __8X(F_0_382 << CONST_SHIFT) }, - pw_0541 = { __8X(F_0_541 << CONST_SHIFT) }, - pw_0707 = { __8X(F_0_707 << CONST_SHIFT) }, - pw_1306 = { __8X(F_1_306 << CONST_SHIFT) }; - __vector unsigned short - pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }; - - /* Pass 1: process rows */ - - row0 = vec_ld(0, data); - row1 = vec_ld(16, data); - row2 = vec_ld(32, data); - row3 = vec_ld(48, data); - row4 = vec_ld(64, data); - row5 = vec_ld(80, data); - row6 = vec_ld(96, data); - row7 = vec_ld(112, data); - - TRANSPOSE(row, col); - - tmp0 = vec_add(col0, col7); - tmp7 = vec_sub(col0, col7); - tmp1 = vec_add(col1, col6); - tmp6 = vec_sub(col1, col6); - tmp2 = vec_add(col2, col5); - tmp5 = vec_sub(col2, col5); - tmp3 = vec_add(col3, col4); - tmp4 = vec_sub(col3, col4); - - DO_FDCT(); - - /* Pass 2: process columns */ - - TRANSPOSE(out, row); - - tmp0 = vec_add(row0, row7); - tmp7 = vec_sub(row0, row7); - tmp1 = vec_add(row1, row6); - tmp6 = vec_sub(row1, row6); - tmp2 = vec_add(row2, row5); - tmp5 = vec_sub(row2, row5); - tmp3 = vec_add(row3, row4); - tmp4 = vec_sub(row3, row4); - - DO_FDCT(); - - vec_st(out0, 0, data); - vec_st(out1, 16, data); - vec_st(out2, 32, data); - vec_st(out3, 48, data); - vec_st(out4, 64, data); - vec_st(out5, 80, data); - vec_st(out6, 96, data); - vec_st(out7, 112, data); -} diff --git a/simd/powerpc/jfdctint-altivec.c b/simd/powerpc/jfdctint-altivec.c deleted file mode 100644 index 6e63cc1..0000000 --- a/simd/powerpc/jfdctint-altivec.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
- * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* SLOW INTEGER FORWARD DCT */ - -#include "jsimd_altivec.h" - - -#define F_0_298 2446 /* FIX(0.298631336) */ -#define F_0_390 3196 /* FIX(0.390180644) */ -#define F_0_541 4433 /* FIX(0.541196100) */ -#define F_0_765 6270 /* FIX(0.765366865) */ -#define F_0_899 7373 /* FIX(0.899976223) */ -#define F_1_175 9633 /* FIX(1.175875602) */ -#define F_1_501 12299 /* FIX(1.501321110) */ -#define F_1_847 15137 /* FIX(1.847759065) */ -#define F_1_961 16069 /* FIX(1.961570560) */ -#define F_2_053 16819 /* FIX(2.053119869) */ -#define F_2_562 20995 /* FIX(2.562915447) */ -#define F_3_072 25172 /* FIX(3.072711026) */ - -#define CONST_BITS 13 -#define PASS1_BITS 2 -#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -#define DESCALE_P2 (CONST_BITS + PASS1_BITS) - - -#define DO_FDCT_COMMON(PASS) { \ - /* (Original) \ - * z1 = (tmp12 + tmp13) * 0.541196100; \ - * data2 = z1 + tmp13 * 0.765366865; \ - * data6 = z1 + tmp12 * -1.847759065; \ - * \ - * (This implementation) \ - * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ - * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ - */ \ - \ - tmp1312l = vec_mergeh(tmp13, tmp12); \ - tmp1312h = vec_mergel(tmp13, tmp12); \ - \ - out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \ - out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \ - out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \ - out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \ - \ - out2l = vec_sra(out2l, descale_p##PASS); \ - out2h = vec_sra(out2h, descale_p##PASS); \ - out6l = vec_sra(out6l, descale_p##PASS); \ - out6h = vec_sra(out6h, descale_p##PASS); \ - \ - out2 = vec_pack(out2l, out2h); \ - out6 = vec_pack(out6l, out6h); \ - \ - /* Odd part */ \ - \ - z3 = vec_add(tmp4, tmp6); \ - z4 = vec_add(tmp5, tmp7); \ - \ - /* (Original) \ - * z5 = (z3 + z4) * 1.175875602; \ - * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ - * z3 += z5; z4 += z5; \ - * \ - * (This implementation) \ - * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ - * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ - */ \ - \ - z34l = vec_mergeh(z3, z4); \ - z34h = vec_mergel(z3, z4); \ - \ - z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \ - z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \ - z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \ - z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \ - \ - /* (Original) \ - * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ - * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ - * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ - * z1 = z1 * -0.899976223; z2 = z2 * 
-2.562915447; \ - * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ - * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ - * \ - * (This implementation) \ - * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ - * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ - * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ - * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ - * data7 = tmp4 + z3; data5 = tmp5 + z4; \ - * data3 = tmp6 + z3; data1 = tmp7 + z4; \ - */ \ - \ - tmp47l = vec_mergeh(tmp4, tmp7); \ - tmp47h = vec_mergel(tmp4, tmp7); \ - \ - out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \ - out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \ - out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \ - out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \ - \ - out7l = vec_sra(out7l, descale_p##PASS); \ - out7h = vec_sra(out7h, descale_p##PASS); \ - out1l = vec_sra(out1l, descale_p##PASS); \ - out1h = vec_sra(out1h, descale_p##PASS); \ - \ - out7 = vec_pack(out7l, out7h); \ - out1 = vec_pack(out1l, out1h); \ - \ - tmp56l = vec_mergeh(tmp5, tmp6); \ - tmp56h = vec_mergel(tmp5, tmp6); \ - \ - out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \ - out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \ - out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \ - out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \ - \ - out5l = vec_sra(out5l, descale_p##PASS); \ - out5h = vec_sra(out5h, descale_p##PASS); \ - out3l = vec_sra(out3l, descale_p##PASS); \ - out3h = vec_sra(out3h, descale_p##PASS); \ - \ - out5 = vec_pack(out5l, out5h); \ - out3 = vec_pack(out3l, out3h); \ -} - -#define DO_FDCT_PASS1() { \ - /* Even part */ \ - \ - tmp10 = vec_add(tmp0, tmp3); \ - tmp13 = vec_sub(tmp0, tmp3); \ - tmp11 = vec_add(tmp1, tmp2); \ - tmp12 = vec_sub(tmp1, tmp2); \ - \ - out0 = vec_add(tmp10, tmp11); \ - out0 = vec_sl(out0, pass1_bits); \ - out4 = vec_sub(tmp10, tmp11); \ - out4 = vec_sl(out4, pass1_bits); \ - \ - DO_FDCT_COMMON(1); \ -} - -#define DO_FDCT_PASS2() { \ - /* Even part */ \ - \ - tmp10 = vec_add(tmp0, tmp3); \ - tmp13 = vec_sub(tmp0, tmp3); \ - tmp11 = vec_add(tmp1, tmp2); \ - tmp12 = vec_sub(tmp1, tmp2); \ - \ - out0 = vec_add(tmp10, tmp11); \ - out0 = vec_add(out0, pw_descale_p2x); \ - out0 = vec_sra(out0, pass1_bits); \ - out4 = vec_sub(tmp10, tmp11); \ - out4 = vec_add(out4, pw_descale_p2x); \ - out4 = vec_sra(out4, pass1_bits); \ - \ - DO_FDCT_COMMON(2); \ -} - - -void jsimd_fdct_islow_altivec(DCTELEM *data) -{ - __vector short row0, row1, row2, row3, row4, row5, row6, row7, - col0, col1, col2, col3, col4, col5, col6, col7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, - tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, - z3, z4, z34l, z34h, - out0, out1, out2, out3, out4, out5, out6, out7; - __vector int z3l, z3h, z4l, z4h, - out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, - out7l, out7h; - - /* Constants */ - __vector short - pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, - pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, - pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, - pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, - pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, - pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, - pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, - pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }, - pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) }; - __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; - 
__vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, - pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; - __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, - descale_p2 = { __4X(DESCALE_P2) }; - - /* Pass 1: process rows */ - - row0 = vec_ld(0, data); - row1 = vec_ld(16, data); - row2 = vec_ld(32, data); - row3 = vec_ld(48, data); - row4 = vec_ld(64, data); - row5 = vec_ld(80, data); - row6 = vec_ld(96, data); - row7 = vec_ld(112, data); - - TRANSPOSE(row, col); - - tmp0 = vec_add(col0, col7); - tmp7 = vec_sub(col0, col7); - tmp1 = vec_add(col1, col6); - tmp6 = vec_sub(col1, col6); - tmp2 = vec_add(col2, col5); - tmp5 = vec_sub(col2, col5); - tmp3 = vec_add(col3, col4); - tmp4 = vec_sub(col3, col4); - - DO_FDCT_PASS1(); - - /* Pass 2: process columns */ - - TRANSPOSE(out, row); - - tmp0 = vec_add(row0, row7); - tmp7 = vec_sub(row0, row7); - tmp1 = vec_add(row1, row6); - tmp6 = vec_sub(row1, row6); - tmp2 = vec_add(row2, row5); - tmp5 = vec_sub(row2, row5); - tmp3 = vec_add(row3, row4); - tmp4 = vec_sub(row3, row4); - - DO_FDCT_PASS2(); - - vec_st(out0, 0, data); - vec_st(out1, 16, data); - vec_st(out2, 32, data); - vec_st(out3, 48, data); - vec_st(out4, 64, data); - vec_st(out5, 80, data); - vec_st(out6, 96, data); - vec_st(out7, 112, data); -} diff --git a/simd/powerpc/jidctfst-altivec.c b/simd/powerpc/jidctfst-altivec.c deleted file mode 100644 index 456c6c6..0000000 --- a/simd/powerpc/jidctfst-altivec.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* FAST INTEGER INVERSE DCT - * - * This is similar to the SSE2 implementation, except that we left-shift the - * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because - * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of: - * the elements in arg3 + the most significant 17 bits of - * (the elements in arg1 * the elements in arg2). 
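The "(Original) / (This implementation)" rewrites in the slow-integer FDCT above, and in the matching slow-integer IDCT further down, all apply one algebraic identity so that each output needs a single pair-wise multiply-sum: vec_msums(a, b, c) yields, per 32-bit lane, c + a[2i]*b[2i] + a[2i+1]*b[2i+1], and vec_mergeh/vec_mergel interleave the two inputs so each lane holds one (tmp13, tmp12) pair. A scalar sketch of the even-part rotation (rot_even_data2 is an illustrative name, not library code):

/* data2 in the rewritten form; expanding shows it equals the original
 * z1 + tmp13 * 0.765366865 with z1 = (tmp12 + tmp13) * 0.541196100:
 *   tmp13*(c541 + c765) + tmp12*c541 == (tmp12 + tmp13)*c541 + tmp13*c765 */
static int rot_even_data2(int tmp13, int tmp12, int round)
{
  const int c541 = 4433;   /* FIX(0.541196100), CONST_BITS = 13 */
  const int c765 = 6270;   /* FIX(0.765366865) */

  return round + tmp13 * (c541 + c765) + tmp12 * c541;
}

Folding the rounding constant into the accumulator argument (pd_descale_p1/pd_descale_p2) is what lets the descale step collapse to a plain vec_sra afterwards.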
- */ - -#include "jsimd_altivec.h" - - -#define F_1_082 277 /* FIX(1.082392200) */ -#define F_1_414 362 /* FIX(1.414213562) */ -#define F_1_847 473 /* FIX(1.847759065) */ -#define F_2_613 669 /* FIX(2.613125930) */ -#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ - -#define CONST_BITS 8 -#define PASS1_BITS 2 -#define PRE_MULTIPLY_SCALE_BITS 2 -#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1) - - -#define DO_IDCT(in) { \ - /* Even part */ \ - \ - tmp10 = vec_add(in##0, in##4); \ - tmp11 = vec_sub(in##0, in##4); \ - tmp13 = vec_add(in##2, in##6); \ - \ - tmp12 = vec_sub(in##2, in##6); \ - tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \ - tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \ - tmp12 = vec_sub(tmp12, tmp13); \ - \ - tmp0 = vec_add(tmp10, tmp13); \ - tmp3 = vec_sub(tmp10, tmp13); \ - tmp1 = vec_add(tmp11, tmp12); \ - tmp2 = vec_sub(tmp11, tmp12); \ - \ - /* Odd part */ \ - \ - z13 = vec_add(in##5, in##3); \ - z10 = vec_sub(in##5, in##3); \ - z10s = vec_sl(z10, pre_multiply_scale_bits); \ - z11 = vec_add(in##1, in##7); \ - z12s = vec_sub(in##1, in##7); \ - z12s = vec_sl(z12s, pre_multiply_scale_bits); \ - \ - tmp11 = vec_sub(z11, z13); \ - tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \ - tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \ - \ - tmp7 = vec_add(z11, z13); \ - \ - /* To avoid overflow... \ - * \ - * (Original) \ - * tmp12 = -2.613125930 * z10 + z5; \ - * \ - * (This implementation) \ - * tmp12 = (-1.613125930 - 1) * z10 + z5; \ - * = -1.613125930 * z10 - z10 + z5; \ - */ \ - \ - z5 = vec_add(z10s, z12s); \ - z5 = vec_madds(z5, pw_F1847, pw_zero); \ - \ - tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \ - tmp10 = vec_sub(tmp10, z5); \ - tmp12 = vec_madds(z10s, pw_MF1613, z5); \ - tmp12 = vec_sub(tmp12, z10); \ - \ - tmp6 = vec_sub(tmp12, tmp7); \ - tmp5 = vec_sub(tmp11, tmp6); \ - tmp4 = vec_add(tmp10, tmp5); \ - \ - out0 = vec_add(tmp0, tmp7); \ - out1 = vec_add(tmp1, tmp6); \ - out2 = vec_add(tmp2, tmp5); \ - out3 = vec_sub(tmp3, tmp4); \ - out4 = vec_add(tmp3, tmp4); \ - out5 = vec_sub(tmp2, tmp5); \ - out6 = vec_sub(tmp1, tmp6); \ - out7 = vec_sub(tmp0, tmp7); \ -} - - -void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) -{ - short *dct_table = (short *)dct_table_; - int *outptr; - - __vector short row0, row1, row2, row3, row4, row5, row6, row7, - col0, col1, col2, col3, col4, col5, col6, col7, - quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, - z5, z10, z10s, z11, z12s, z13, - out0, out1, out2, out3, out4, out5, out6, out7; - __vector signed char outb; - - /* Constants */ - __vector short pw_zero = { __8X(0) }, - pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) }, - pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) }, - pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) }, - pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) }; - __vector unsigned short - pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) }, - pass1_bits3 = { __8X(PASS1_BITS + 3) }; - __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; - - /* Pass 1: process columns */ - - col0 = vec_ld(0, coef_block); - col1 = vec_ld(16, coef_block); - col2 = vec_ld(32, coef_block); - col3 = vec_ld(48, coef_block); - col4 = vec_ld(64, coef_block); - col5 = vec_ld(80, coef_block); - col6 = vec_ld(96, coef_block); - col7 = vec_ld(112, coef_block); - - tmp1 = vec_or(col1, col2); - tmp2 = vec_or(col3, col4); - tmp1 = vec_or(tmp1, tmp2); - tmp3 = 
vec_or(col5, col6); - tmp3 = vec_or(tmp3, col7); - tmp1 = vec_or(tmp1, tmp3); - - quant0 = vec_ld(0, dct_table); - col0 = vec_mladd(col0, quant0, pw_zero); - - if (vec_all_eq(tmp1, pw_zero)) { - /* AC terms all zero */ - - row0 = vec_splat(col0, 0); - row1 = vec_splat(col0, 1); - row2 = vec_splat(col0, 2); - row3 = vec_splat(col0, 3); - row4 = vec_splat(col0, 4); - row5 = vec_splat(col0, 5); - row6 = vec_splat(col0, 6); - row7 = vec_splat(col0, 7); - - } else { - - quant1 = vec_ld(16, dct_table); - quant2 = vec_ld(32, dct_table); - quant3 = vec_ld(48, dct_table); - quant4 = vec_ld(64, dct_table); - quant5 = vec_ld(80, dct_table); - quant6 = vec_ld(96, dct_table); - quant7 = vec_ld(112, dct_table); - - col1 = vec_mladd(col1, quant1, pw_zero); - col2 = vec_mladd(col2, quant2, pw_zero); - col3 = vec_mladd(col3, quant3, pw_zero); - col4 = vec_mladd(col4, quant4, pw_zero); - col5 = vec_mladd(col5, quant5, pw_zero); - col6 = vec_mladd(col6, quant6, pw_zero); - col7 = vec_mladd(col7, quant7, pw_zero); - - DO_IDCT(col); - - TRANSPOSE(out, row); - } - - /* Pass 2: process rows */ - - DO_IDCT(row); - - out0 = vec_sra(out0, pass1_bits3); - out1 = vec_sra(out1, pass1_bits3); - out2 = vec_sra(out2, pass1_bits3); - out3 = vec_sra(out3, pass1_bits3); - out4 = vec_sra(out4, pass1_bits3); - out5 = vec_sra(out5, pass1_bits3); - out6 = vec_sra(out6, pass1_bits3); - out7 = vec_sra(out7, pass1_bits3); - - TRANSPOSE(out, col); - - outb = vec_packs(col0, col0); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[0] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col1, col1); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[1] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col2, col2); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[2] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col3, col3); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[3] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col4, col4); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[4] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col5, col5); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[5] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col6, col6); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[6] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col7, col7); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[7] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); -} diff --git a/simd/powerpc/jidctint-altivec.c b/simd/powerpc/jidctint-altivec.c deleted file mode 100644 index 0e5dd58..0000000 --- a/simd/powerpc/jidctint-altivec.c +++ /dev/null @@ -1,357 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. 
In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* SLOW INTEGER INVERSE DCT */ - -#include "jsimd_altivec.h" - - -#define F_0_298 2446 /* FIX(0.298631336) */ -#define F_0_390 3196 /* FIX(0.390180644) */ -#define F_0_541 4433 /* FIX(0.541196100) */ -#define F_0_765 6270 /* FIX(0.765366865) */ -#define F_0_899 7373 /* FIX(0.899976223) */ -#define F_1_175 9633 /* FIX(1.175875602) */ -#define F_1_501 12299 /* FIX(1.501321110) */ -#define F_1_847 15137 /* FIX(1.847759065) */ -#define F_1_961 16069 /* FIX(1.961570560) */ -#define F_2_053 16819 /* FIX(2.053119869) */ -#define F_2_562 20995 /* FIX(2.562915447) */ -#define F_3_072 25172 /* FIX(3.072711026) */ - -#define CONST_BITS 13 -#define PASS1_BITS 2 -#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) - - -#define DO_IDCT(in, PASS) { \ - /* Even part \ - * \ - * (Original) \ - * z1 = (z2 + z3) * 0.541196100; \ - * tmp2 = z1 + z3 * -1.847759065; \ - * tmp3 = z1 + z2 * 0.765366865; \ - * \ - * (This implementation) \ - * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ - * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ - */ \ - \ - in##26l = vec_mergeh(in##2, in##6); \ - in##26h = vec_mergel(in##2, in##6); \ - \ - tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \ - tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \ - tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \ - tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \ - \ - tmp0 = vec_add(in##0, in##4); \ - tmp1 = vec_sub(in##0, in##4); \ - \ - tmp0l = vec_unpackh(tmp0); \ - tmp0h = vec_unpackl(tmp0); \ - tmp0l = vec_sl(tmp0l, const_bits); \ - tmp0h = vec_sl(tmp0h, const_bits); \ - tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \ - tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \ - \ - tmp10l = vec_add(tmp0l, tmp3l); \ - tmp10h = vec_add(tmp0h, tmp3h); \ - tmp13l = vec_sub(tmp0l, tmp3l); \ - tmp13h = vec_sub(tmp0h, tmp3h); \ - \ - tmp1l = vec_unpackh(tmp1); \ - tmp1h = vec_unpackl(tmp1); \ - tmp1l = vec_sl(tmp1l, const_bits); \ - tmp1h = vec_sl(tmp1h, const_bits); \ - tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \ - tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \ - \ - tmp11l = vec_add(tmp1l, tmp2l); \ - tmp11h = vec_add(tmp1h, tmp2h); \ - tmp12l = vec_sub(tmp1l, tmp2l); \ - tmp12h = vec_sub(tmp1h, tmp2h); \ - \ - /* Odd part */ \ - \ - z3 = vec_add(in##3, in##7); \ - z4 = vec_add(in##1, in##5); \ - \ - /* (Original) \ - * z5 = (z3 + z4) * 1.175875602; \ - * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ - * z3 += z5; z4 += z5; \ - * \ - * (This implementation) \ - * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ - * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ - */ \ - \ - z34l = vec_mergeh(z3, z4); \ - z34h = vec_mergel(z3, z4); \ - \ - z3l = vec_msums(z34l, 
pw_mf078_f117, pd_zero); \ - z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \ - z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \ - z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \ - \ - /* (Original) \ - * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ - * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ - * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ - * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ - * tmp0 += z1 + z3; tmp1 += z2 + z4; \ - * tmp2 += z2 + z3; tmp3 += z1 + z4; \ - * \ - * (This implementation) \ - * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ - * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ - * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ - * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ - * tmp0 += z3; tmp1 += z4; \ - * tmp2 += z3; tmp3 += z4; \ - */ \ - \ - in##71l = vec_mergeh(in##7, in##1); \ - in##71h = vec_mergel(in##7, in##1); \ - \ - tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \ - tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \ - tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \ - tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \ - \ - in##53l = vec_mergeh(in##5, in##3); \ - in##53h = vec_mergel(in##5, in##3); \ - \ - tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \ - tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \ - tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \ - tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \ - \ - /* Final output stage */ \ - \ - out0l = vec_add(tmp10l, tmp3l); \ - out0h = vec_add(tmp10h, tmp3h); \ - out7l = vec_sub(tmp10l, tmp3l); \ - out7h = vec_sub(tmp10h, tmp3h); \ - \ - out0l = vec_sra(out0l, descale_p##PASS); \ - out0h = vec_sra(out0h, descale_p##PASS); \ - out7l = vec_sra(out7l, descale_p##PASS); \ - out7h = vec_sra(out7h, descale_p##PASS); \ - \ - out0 = vec_pack(out0l, out0h); \ - out7 = vec_pack(out7l, out7h); \ - \ - out1l = vec_add(tmp11l, tmp2l); \ - out1h = vec_add(tmp11h, tmp2h); \ - out6l = vec_sub(tmp11l, tmp2l); \ - out6h = vec_sub(tmp11h, tmp2h); \ - \ - out1l = vec_sra(out1l, descale_p##PASS); \ - out1h = vec_sra(out1h, descale_p##PASS); \ - out6l = vec_sra(out6l, descale_p##PASS); \ - out6h = vec_sra(out6h, descale_p##PASS); \ - \ - out1 = vec_pack(out1l, out1h); \ - out6 = vec_pack(out6l, out6h); \ - \ - out2l = vec_add(tmp12l, tmp1l); \ - out2h = vec_add(tmp12h, tmp1h); \ - out5l = vec_sub(tmp12l, tmp1l); \ - out5h = vec_sub(tmp12h, tmp1h); \ - \ - out2l = vec_sra(out2l, descale_p##PASS); \ - out2h = vec_sra(out2h, descale_p##PASS); \ - out5l = vec_sra(out5l, descale_p##PASS); \ - out5h = vec_sra(out5h, descale_p##PASS); \ - \ - out2 = vec_pack(out2l, out2h); \ - out5 = vec_pack(out5l, out5h); \ - \ - out3l = vec_add(tmp13l, tmp0l); \ - out3h = vec_add(tmp13h, tmp0h); \ - out4l = vec_sub(tmp13l, tmp0l); \ - out4h = vec_sub(tmp13h, tmp0h); \ - \ - out3l = vec_sra(out3l, descale_p##PASS); \ - out3h = vec_sra(out3h, descale_p##PASS); \ - out4l = vec_sra(out4l, descale_p##PASS); \ - out4h = vec_sra(out4h, descale_p##PASS); \ - \ - out3 = vec_pack(out3l, out3h); \ - out4 = vec_pack(out4l, out4h); \ -} - - -void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) -{ - short *dct_table = (short *)dct_table_; - int *outptr; - - __vector short row0, row1, row2, row3, row4, row5, row6, row7, - col0, col1, col2, col3, col4, col5, col6, col7, - quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, - tmp0, tmp1, tmp2, tmp3, z3, z4, - z34l, z34h, 
col71l, col71h, col26l, col26h, col53l, col53h, - row71l, row71h, row26l, row26h, row53l, row53h, - out0, out1, out2, out3, out4, out5, out6, out7; - __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, - tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, - z3l, z3h, z4l, z4h, - out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, - out5l, out5h, out6l, out6h, out7l, out7h; - __vector signed char outb; - - /* Constants */ - __vector short pw_zero = { __8X(0) }, - pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, - pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, - pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, - pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, - pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, - pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, - pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, - pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; - __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; - __vector int pd_zero = { __4X(0) }, - pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, - pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; - __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, - descale_p2 = { __4X(DESCALE_P2) }, - const_bits = { __4X(CONST_BITS) }; - __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; - - /* Pass 1: process columns */ - - col0 = vec_ld(0, coef_block); - col1 = vec_ld(16, coef_block); - col2 = vec_ld(32, coef_block); - col3 = vec_ld(48, coef_block); - col4 = vec_ld(64, coef_block); - col5 = vec_ld(80, coef_block); - col6 = vec_ld(96, coef_block); - col7 = vec_ld(112, coef_block); - - tmp1 = vec_or(col1, col2); - tmp2 = vec_or(col3, col4); - tmp1 = vec_or(tmp1, tmp2); - tmp3 = vec_or(col5, col6); - tmp3 = vec_or(tmp3, col7); - tmp1 = vec_or(tmp1, tmp3); - - quant0 = vec_ld(0, dct_table); - col0 = vec_mladd(col0, quant0, pw_zero); - - if (vec_all_eq(tmp1, pw_zero)) { - /* AC terms all zero */ - - col0 = vec_sl(col0, pass1_bits); - - row0 = vec_splat(col0, 0); - row1 = vec_splat(col0, 1); - row2 = vec_splat(col0, 2); - row3 = vec_splat(col0, 3); - row4 = vec_splat(col0, 4); - row5 = vec_splat(col0, 5); - row6 = vec_splat(col0, 6); - row7 = vec_splat(col0, 7); - - } else { - - quant1 = vec_ld(16, dct_table); - quant2 = vec_ld(32, dct_table); - quant3 = vec_ld(48, dct_table); - quant4 = vec_ld(64, dct_table); - quant5 = vec_ld(80, dct_table); - quant6 = vec_ld(96, dct_table); - quant7 = vec_ld(112, dct_table); - - col1 = vec_mladd(col1, quant1, pw_zero); - col2 = vec_mladd(col2, quant2, pw_zero); - col3 = vec_mladd(col3, quant3, pw_zero); - col4 = vec_mladd(col4, quant4, pw_zero); - col5 = vec_mladd(col5, quant5, pw_zero); - col6 = vec_mladd(col6, quant6, pw_zero); - col7 = vec_mladd(col7, quant7, pw_zero); - - DO_IDCT(col, 1); - - TRANSPOSE(out, row); - } - - /* Pass 2: process rows */ - - DO_IDCT(row, 2); - - TRANSPOSE(out, col); - - outb = vec_packs(col0, col0); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[0] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col1, col1); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[1] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col2, col2); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[2] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, 
outptr); - - outb = vec_packs(col3, col3); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[3] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col4, col4); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[4] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col5, col5); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[5] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col6, col6); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[6] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); - - outb = vec_packs(col7, col7); - outb = vec_add(outb, pb_centerjsamp); - outptr = (int *)(output_buf[7] + output_col); - vec_ste((__vector int)outb, 0, outptr); - vec_ste((__vector int)outb, 4, outptr); -} diff --git a/simd/powerpc/jquanti-altivec.c b/simd/powerpc/jquanti-altivec.c deleted file mode 100644 index 7d6e325..0000000 --- a/simd/powerpc/jquanti-altivec.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ - -#include "jsimd_altivec.h" - - -/* NOTE: The address will either be aligned or offset by 8 bytes, so we can - * always get the data we want by using a single vector load (although we may - * have to permute the result.) 
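The note above describes the classic AltiVec misaligned-load idiom that the big-endian LOAD_ROW below implements: vec_ld() always loads the aligned 16-byte block containing the address, and a vec_lvsl() permute rotates the wanted bytes to the front. A minimal sketch under that same assumption (load_row_bytes is an illustrative wrapper; it is safe only because the note guarantees the eight samples never straddle two 16-byte blocks):

#include <altivec.h>
#include <stdint.h>

static __vector unsigned char load_row_bytes(const unsigned char *p)
{
  /* Load the aligned 16-byte block containing p... */
  __vector unsigned char v = vec_ld(0, p);

  /* ...and, if p was not 16-byte aligned, rotate the wanted bytes to
   * the front of the vector. */
  if ((uintptr_t)p & 15)
    v = vec_perm(v, v, vec_lvsl(0, p));
  return v;
}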
- */ -#if __BIG_ENDIAN__ - -#define LOAD_ROW(row) { \ - elemptr = sample_data[row] + start_col; \ - in##row = vec_ld(0, elemptr); \ - if ((size_t)elemptr & 15) \ - in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \ -} - -#else - -#define LOAD_ROW(row) { \ - elemptr = sample_data[row] + start_col; \ - in##row = vec_vsx_ld(0, elemptr); \ -} - -#endif - - -void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM *workspace) -{ - JSAMPROW elemptr; - - __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7; - __vector short out0, out1, out2, out3, out4, out5, out6, out7; - - /* Constants */ - __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) }; - __vector unsigned char pb_zero = { __16X(0) }; - - LOAD_ROW(0); - LOAD_ROW(1); - LOAD_ROW(2); - LOAD_ROW(3); - LOAD_ROW(4); - LOAD_ROW(5); - LOAD_ROW(6); - LOAD_ROW(7); - - out0 = (__vector short)VEC_UNPACKHU(in0); - out1 = (__vector short)VEC_UNPACKHU(in1); - out2 = (__vector short)VEC_UNPACKHU(in2); - out3 = (__vector short)VEC_UNPACKHU(in3); - out4 = (__vector short)VEC_UNPACKHU(in4); - out5 = (__vector short)VEC_UNPACKHU(in5); - out6 = (__vector short)VEC_UNPACKHU(in6); - out7 = (__vector short)VEC_UNPACKHU(in7); - - out0 = vec_sub(out0, pw_centerjsamp); - out1 = vec_sub(out1, pw_centerjsamp); - out2 = vec_sub(out2, pw_centerjsamp); - out3 = vec_sub(out3, pw_centerjsamp); - out4 = vec_sub(out4, pw_centerjsamp); - out5 = vec_sub(out5, pw_centerjsamp); - out6 = vec_sub(out6, pw_centerjsamp); - out7 = vec_sub(out7, pw_centerjsamp); - - vec_st(out0, 0, workspace); - vec_st(out1, 16, workspace); - vec_st(out2, 32, workspace); - vec_st(out3, 48, workspace); - vec_st(out4, 64, workspace); - vec_st(out5, 80, workspace); - vec_st(out6, 96, workspace); - vec_st(out7, 112, workspace); -} - - -#define WORD_BIT 16 - -/* There is no AltiVec 16-bit unsigned multiply instruction, hence this. - We basically need an unsigned equivalent of vec_madds(). 
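In scalar terms, the MULTIPLY() macro that follows returns the high 16 bits of an unsigned 16 x 16-bit product: vec_mule/vec_mulo form the full 32-bit products of the even and odd halfwords, and shift_pack_index permutes the upper halfword of each product back into element order. The quantizer below also relies on a branch-free absolute value, taking s = x >> 15 (all ones for negative x) and computing (x ^ s) - s; the same xor/sub against the saved sign restores it after the multiplies. A sketch of both (helper names are illustrative; arithmetic right shift of signed values is assumed):

#include <stdint.h>

/* What MULTIPLY(vs0, vs1, out) computes per element. */
static uint16_t umulhi16(uint16_t a, uint16_t b)
{
  return (uint16_t)(((uint32_t)a * b) >> 16);
}

/* Branch-free |x|: s is 0 for x >= 0 and -1 (all ones) for x < 0, so
 * (x ^ s) - s either leaves x alone or yields ~x + 1 == -x. */
static int16_t abs16(int16_t x)
{
  int16_t s = (int16_t)(x >> 15);

  return (int16_t)((x ^ s) - s);
}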
*/ - -#define MULTIPLY(vs0, vs1, out) { \ - tmpe = vec_mule((__vector unsigned short)vs0, \ - (__vector unsigned short)vs1); \ - tmpo = vec_mulo((__vector unsigned short)vs0, \ - (__vector unsigned short)vs1); \ - out = (__vector short)vec_perm((__vector unsigned short)tmpe, \ - (__vector unsigned short)tmpo, \ - shift_pack_index); \ -} - -void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors, - DCTELEM *workspace) -{ - __vector short row0, row1, row2, row3, row4, row5, row6, row7, - row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, - corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, - recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, - scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; - __vector unsigned int tmpe, tmpo; - - /* Constants */ - __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) }; -#if __BIG_ENDIAN__ - __vector unsigned char shift_pack_index = - { 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 }; -#else - __vector unsigned char shift_pack_index = - { 2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 }; -#endif - - row0 = vec_ld(0, workspace); - row1 = vec_ld(16, workspace); - row2 = vec_ld(32, workspace); - row3 = vec_ld(48, workspace); - row4 = vec_ld(64, workspace); - row5 = vec_ld(80, workspace); - row6 = vec_ld(96, workspace); - row7 = vec_ld(112, workspace); - - /* Branch-less absolute value */ - row0s = vec_sra(row0, pw_word_bit_m1); - row1s = vec_sra(row1, pw_word_bit_m1); - row2s = vec_sra(row2, pw_word_bit_m1); - row3s = vec_sra(row3, pw_word_bit_m1); - row4s = vec_sra(row4, pw_word_bit_m1); - row5s = vec_sra(row5, pw_word_bit_m1); - row6s = vec_sra(row6, pw_word_bit_m1); - row7s = vec_sra(row7, pw_word_bit_m1); - row0 = vec_xor(row0, row0s); - row1 = vec_xor(row1, row1s); - row2 = vec_xor(row2, row2s); - row3 = vec_xor(row3, row3s); - row4 = vec_xor(row4, row4s); - row5 = vec_xor(row5, row5s); - row6 = vec_xor(row6, row6s); - row7 = vec_xor(row7, row7s); - row0 = vec_sub(row0, row0s); - row1 = vec_sub(row1, row1s); - row2 = vec_sub(row2, row2s); - row3 = vec_sub(row3, row3s); - row4 = vec_sub(row4, row4s); - row5 = vec_sub(row5, row5s); - row6 = vec_sub(row6, row6s); - row7 = vec_sub(row7, row7s); - - corr0 = vec_ld(DCTSIZE2 * 2, divisors); - corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors); - corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors); - corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors); - corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors); - corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors); - corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors); - corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors); - - row0 = vec_add(row0, corr0); - row1 = vec_add(row1, corr1); - row2 = vec_add(row2, corr2); - row3 = vec_add(row3, corr3); - row4 = vec_add(row4, corr4); - row5 = vec_add(row5, corr5); - row6 = vec_add(row6, corr6); - row7 = vec_add(row7, corr7); - - recip0 = vec_ld(0, divisors); - recip1 = vec_ld(16, divisors); - recip2 = vec_ld(32, divisors); - recip3 = vec_ld(48, divisors); - recip4 = vec_ld(64, divisors); - recip5 = vec_ld(80, divisors); - recip6 = vec_ld(96, divisors); - recip7 = vec_ld(112, divisors); - - MULTIPLY(row0, recip0, row0); - MULTIPLY(row1, recip1, row1); - MULTIPLY(row2, recip2, row2); - MULTIPLY(row3, recip3, row3); - MULTIPLY(row4, recip4, row4); - MULTIPLY(row5, recip5, row5); - MULTIPLY(row6, recip6, row6); - MULTIPLY(row7, recip7, row7); - - scale0 = vec_ld(DCTSIZE2 * 4, divisors); - scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors); - scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors); - scale3 = 
vec_ld(DCTSIZE2 * 4 + 48, divisors); - scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors); - scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors); - scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors); - scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors); - - MULTIPLY(row0, scale0, row0); - MULTIPLY(row1, scale1, row1); - MULTIPLY(row2, scale2, row2); - MULTIPLY(row3, scale3, row3); - MULTIPLY(row4, scale4, row4); - MULTIPLY(row5, scale5, row5); - MULTIPLY(row6, scale6, row6); - MULTIPLY(row7, scale7, row7); - - row0 = vec_xor(row0, row0s); - row1 = vec_xor(row1, row1s); - row2 = vec_xor(row2, row2s); - row3 = vec_xor(row3, row3s); - row4 = vec_xor(row4, row4s); - row5 = vec_xor(row5, row5s); - row6 = vec_xor(row6, row6s); - row7 = vec_xor(row7, row7s); - row0 = vec_sub(row0, row0s); - row1 = vec_sub(row1, row1s); - row2 = vec_sub(row2, row2s); - row3 = vec_sub(row3, row3s); - row4 = vec_sub(row4, row4s); - row5 = vec_sub(row5, row5s); - row6 = vec_sub(row6, row6s); - row7 = vec_sub(row7, row7s); - - vec_st(row0, 0, coef_block); - vec_st(row1, 16, coef_block); - vec_st(row2, 32, coef_block); - vec_st(row3, 48, coef_block); - vec_st(row4, 64, coef_block); - vec_st(row5, 80, coef_block); - vec_st(row6, 96, coef_block); - vec_st(row7, 112, coef_block); -} diff --git a/simd/powerpc/jsimd.c b/simd/powerpc/jsimd.c deleted file mode 100644 index d0d3981..0000000 --- a/simd/powerpc/jsimd.c +++ /dev/null @@ -1,872 +0,0 @@ -/* - * jsimd_powerpc.c - * - * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. - * - * Based on the x86 SIMD extension for IJG JPEG library, - * Copyright (C) 1999-2006, MIYASAKA Masaru. - * For conditions of distribution and use, see copyright notice in jsimdext.inc - * - * This file contains the interface between the "normal" portions - * of the library and the SIMD implementations when running on a - * PowerPC architecture. - */ - -#ifdef __amigaos4__ -/* This must be defined first as it re-defines GLOBAL otherwise */ -#include <proto/exec.h> -#endif - -#define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" - -#include <stdio.h> -#include <string.h> -#include <ctype.h> - -#if defined(__OpenBSD__) -#include <sys/param.h> -#include <sys/sysctl.h> -#include <machine/cpu.h> -#endif - -static unsigned int simd_support = ~0; - -#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) - -#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) - -LOCAL(int) -check_feature(char *buffer, char *feature) -{ - char *p; - - if (*feature == 0) - return 0; - if (strncmp(buffer, "cpu", 3) != 0) - return 0; - buffer += 3; - while (isspace(*buffer)) - buffer++; - - /* Check if 'feature' is present in the buffer as a separate word */ - while ((p = strstr(buffer, feature))) { - if (p > buffer && !isspace(*(p - 1))) { - buffer++; - continue; - } - p += strlen(feature); - if (*p != 0 && !isspace(*p)) { - buffer++; - continue; - } - return 1; - } - return 0; -} - -LOCAL(int) -parse_proc_cpuinfo(int bufsize) -{ - char *buffer = (char *)malloc(bufsize); - FILE *fd; - - simd_support = 0; - - if (!buffer) - return 0; - - fd = fopen("/proc/cpuinfo", "r"); - if (fd) { - while (fgets(buffer, bufsize, fd)) { - if (!strchr(buffer, '\n') && !feof(fd)) { - /* "impossible" happened - insufficient size of the buffer! 
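check_feature() above accepts the feature name only as a separate whitespace-delimited word on a line that begins with "cpu", so on a line such as "cpu : 7447A, altivec supported" the query "altivec" matches but a substring like "ivec" does not. A standalone restatement of the boundary test (has_word is an illustrative name):

#include <ctype.h>
#include <string.h>

static int has_word(const char *s, const char *w)
{
  size_t n = strlen(w);
  const char *p = s;

  if (n == 0)
    return 0;
  while ((p = strstr(p, w)) != NULL) {
    /* Accept only when bounded by start/end of string or whitespace. */
    if ((p == s || isspace((unsigned char)p[-1])) &&
        (p[n] == '\0' || isspace((unsigned char)p[n])))
      return 1;
    p++;
  }
  return 0;
}

init_simd() below drives the parse with a buffer size that doubles on each failure, capped by SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT, and then honours the JSIMD_FORCEALTIVEC/JSIMD_FORCENONE environment overrides.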
*/ - fclose(fd); - free(buffer); - return 0; - } - if (check_feature(buffer, "altivec")) - simd_support |= JSIMD_ALTIVEC; - } - fclose(fd); - } - free(buffer); - return 1; -} - -#endif - -/* - * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. - */ -LOCAL(void) -init_simd(void) -{ -#ifndef NO_GETENV - char *env = NULL; -#endif -#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) - int bufsize = 1024; /* an initial guess for the line buffer size limit */ -#elif defined(__amigaos4__) - uint32 altivec = 0; -#elif defined(__OpenBSD__) - int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC }; - int altivec; - size_t len = sizeof(altivec); -#endif - - if (simd_support != ~0U) - return; - - simd_support = 0; - -#if defined(__ALTIVEC__) || defined(__APPLE__) - simd_support |= JSIMD_ALTIVEC; -#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) - while (!parse_proc_cpuinfo(bufsize)) { - bufsize *= 2; - if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) - break; - } -#elif defined(__amigaos4__) - IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE); - if (altivec == VECTORTYPE_ALTIVEC) - simd_support |= JSIMD_ALTIVEC; -#elif defined(__OpenBSD__) - if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0) - simd_support |= JSIMD_ALTIVEC; -#endif - -#ifndef NO_GETENV - /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCEALTIVEC"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_support = JSIMD_ALTIVEC; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) - simd_support = 0; -#endif -} - -GLOBAL(int) -jsimd_can_rgb_ycc(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_rgb_gray(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_ycc_rgb565(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - - switch (cinfo->in_color_space) { - case JCS_EXT_RGB: - altivecfct = jsimd_extrgb_ycc_convert_altivec; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - altivecfct = jsimd_extrgbx_ycc_convert_altivec; - break; - case JCS_EXT_BGR: - altivecfct = jsimd_extbgr_ycc_convert_altivec; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - altivecfct = jsimd_extbgrx_ycc_convert_altivec; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - altivecfct = jsimd_extxbgr_ycc_convert_altivec; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - altivecfct = jsimd_extxrgb_ycc_convert_altivec; - 
break; - default: - altivecfct = jsimd_rgb_ycc_convert_altivec; - break; - } - - altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); -} - -GLOBAL(void) -jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPIMAGE output_buf, JDIMENSION output_row, - int num_rows) -{ - void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); - - switch (cinfo->in_color_space) { - case JCS_EXT_RGB: - altivecfct = jsimd_extrgb_gray_convert_altivec; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - altivecfct = jsimd_extrgbx_gray_convert_altivec; - break; - case JCS_EXT_BGR: - altivecfct = jsimd_extbgr_gray_convert_altivec; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - altivecfct = jsimd_extbgrx_gray_convert_altivec; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - altivecfct = jsimd_extxbgr_gray_convert_altivec; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - altivecfct = jsimd_extxrgb_gray_convert_altivec; - break; - default: - altivecfct = jsimd_rgb_gray_convert_altivec; - break; - } - - altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); -} - -GLOBAL(void) -jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ - void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - altivecfct = jsimd_ycc_extrgb_convert_altivec; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - altivecfct = jsimd_ycc_extrgbx_convert_altivec; - break; - case JCS_EXT_BGR: - altivecfct = jsimd_ycc_extbgr_convert_altivec; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - altivecfct = jsimd_ycc_extbgrx_convert_altivec; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - altivecfct = jsimd_ycc_extxbgr_convert_altivec; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - altivecfct = jsimd_ycc_extxrgb_convert_altivec; - break; - default: - altivecfct = jsimd_ycc_rgb_convert_altivec; - break; - } - - altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); -} - -GLOBAL(void) -jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION input_row, JSAMPARRAY output_buf, - int num_rows) -{ -} - -GLOBAL(int) -jsimd_can_h2v2_downsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_downsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, - compptr->width_in_blocks, input_data, - output_data); -} - -GLOBAL(void) -jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) -{ - jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor, - compptr->v_samp_factor, - compptr->width_in_blocks, input_data, - output_data); -} - -GLOBAL(int) -jsimd_can_h2v2_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - 
if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, - input_data, output_data_ptr); -} - -GLOBAL(void) -jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width, - input_data, output_data_ptr); -} - -GLOBAL(int) -jsimd_can_h2v2_fancy_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_fancy_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, - output_data_ptr); -} - -GLOBAL(void) -jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -{ - jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor, - compptr->downsampled_width, input_data, - output_data_ptr); -} - -GLOBAL(int) -jsimd_can_h2v2_merged_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_h2v1_merged_upsample(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(void) -jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ - void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec; - break; - case JCS_EXT_BGR: - altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec; - break; - default: - altivecfct = jsimd_h2v2_merged_upsample_altivec; - break; - } - - 
altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); -} - -GLOBAL(void) -jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, - JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -{ - void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); - - switch (cinfo->out_color_space) { - case JCS_EXT_RGB: - altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec; - break; - case JCS_EXT_RGBX: - case JCS_EXT_RGBA: - altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec; - break; - case JCS_EXT_BGR: - altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec; - break; - case JCS_EXT_BGRX: - case JCS_EXT_BGRA: - altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec; - break; - case JCS_EXT_XBGR: - case JCS_EXT_ABGR: - altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec; - break; - case JCS_EXT_XRGB: - case JCS_EXT_ARGB: - altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec; - break; - default: - altivecfct = jsimd_h2v1_merged_upsample_altivec; - break; - } - - altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); -} - -GLOBAL(int) -jsimd_can_convsamp(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (BITS_IN_JSAMPLE != 8) - return 0; - if (sizeof(JDIMENSION) != 4) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_convsamp_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, - DCTELEM *workspace) -{ - jsimd_convsamp_altivec(sample_data, start_col, workspace); -} - -GLOBAL(void) -jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, - FAST_FLOAT *workspace) -{ -} - -GLOBAL(int) -jsimd_can_fdct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_ifast(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_fdct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_fdct_islow(DCTELEM *data) -{ - jsimd_fdct_islow_altivec(data); -} - -GLOBAL(void) -jsimd_fdct_ifast(DCTELEM *data) -{ - jsimd_fdct_ifast_altivec(data); -} - -GLOBAL(void) -jsimd_fdct_float(FAST_FLOAT *data) -{ -} - -GLOBAL(int) -jsimd_can_quantize(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - if (sizeof(DCTELEM) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_quantize_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) -{ - jsimd_quantize_altivec(coef_block, divisors, workspace); -} - -GLOBAL(void) -jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, - FAST_FLOAT *workspace) -{ -} - -GLOBAL(int) -jsimd_can_idct_2x2(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_can_idct_4x4(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(void) -jsimd_idct_4x4(j_decompress_ptr cinfo, 
jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_idct_islow(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_ifast(void) -{ - init_simd(); - - /* The code is optimised for these values only */ - if (DCTSIZE != 8) - return 0; - if (sizeof(JCOEF) != 2) - return 0; - - if (simd_support & JSIMD_ALTIVEC) - return 1; - - return 0; -} - -GLOBAL(int) -jsimd_can_idct_float(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf, - output_col); -} - -GLOBAL(void) -jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ - jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf, - output_col); -} - -GLOBAL(void) -jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, - JDIMENSION output_col) -{ -} - -GLOBAL(int) -jsimd_can_huff_encode_one_block(void) -{ - return 0; -} - -GLOBAL(JOCTET *) -jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, - int last_dc_val, c_derived_tbl *dctbl, - c_derived_tbl *actbl) -{ - return NULL; -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_first_prepare(void) -{ - return 0; -} - -GLOBAL(void) -jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) -{ -} - -GLOBAL(int) -jsimd_can_encode_mcu_AC_refine_prepare(void) -{ - return 0; -} - -GLOBAL(int) -jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, - const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) -{ - return 0; -} diff --git a/simd/powerpc/jsimd_altivec.h b/simd/powerpc/jsimd_altivec.h deleted file mode 100644 index e8bdb06..0000000 --- a/simd/powerpc/jsimd_altivec.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * AltiVec optimizations for libjpeg-turbo - * - * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
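Taken together, the jsimd_can_*()/jsimd_*() pairs in the deleted jsimd.c form the dispatch contract with the core library: each can-function validates the build-time layout assumptions and the runtime simd_support bits, and the caller keeps the portable routine otherwise. Roughly, on the caller side (a sketch only: the real selection happens once, when jddctmgr.c and friends install method pointers, not per call; jpeg_idct_islow is the portable fallback from the core library):

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"
#include "jsimd.h"

/* Hypothetical helper showing the shape of the selection. */
static inverse_DCT_method_ptr pick_idct_islow(void)
{
  return jsimd_can_idct_islow() ? jsimd_idct_islow : jpeg_idct_islow;
}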
- */ - -#define JPEG_INTERNALS -#include "../../jinclude.h" -#include "../../jpeglib.h" -#include "../../jsimd.h" -#include "../../jdct.h" -#include "../../jsimddct.h" -#include "../jsimd.h" -#include <altivec.h> - - -/* Common code */ - -#define __4X(a) a, a, a, a -#define __4X2(a, b) a, b, a, b, a, b, a, b -#define __8X(a) __4X(a), __4X(a) -#define __16X(a) __8X(a), __8X(a) - -#define TRANSPOSE(row, col) { \ - __vector short row04l, row04h, row15l, row15h, \ - row26l, row26h, row37l, row37h; \ - __vector short col01e, col01o, col23e, col23o, \ - col45e, col45o, col67e, col67o; \ - \ - /* transpose coefficients (phase 1) */ \ - row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \ - row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \ - row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \ - row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \ - row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \ - row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \ - row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \ - row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \ - \ - /* transpose coefficients (phase 2) */ \ - col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \ - col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \ - col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \ - col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \ - col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \ - col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \ - col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \ - col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \ - \ - /* transpose coefficients (phase 3) */ \ - col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \ - col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \ - col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \ - col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \ - col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \ - col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \ - col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \ - col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \ -} - -#ifndef min -#define min(a, b) ((a) < (b) ? (a) : (b)) -#endif - - -/* Macros to abstract big/little endian bit twiddling */ - -#if __BIG_ENDIAN__ - -#define VEC_LD(a, b) vec_ld(a, b) -#define VEC_ST(a, b, c) vec_st(a, b, c) -#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a) -#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a) - -#else - -#define VEC_LD(a, b) vec_vsx_ld(a, b) -#define VEC_ST(a, b, c) vec_vsx_st(a, b, c) -#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero) -#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero) - -#endif diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm index 5fa3848..10d2834 100644 --- a/simd/x86_64/jccolext-avx2.asm +++ b/simd/x86_64/jccolext-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -96,12 +94,12 @@ EXTN(jsimd_rgb_ycc_convert_avx2): test cl, SIZEOF_BYTE jz short .column_ld2 sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + movzx rax, byte [rsi+rcx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] + movzx rdx, word [rsi+rcx] shl rax, WORD_BIT or rax, rdx .column_ld4: diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm index b1486c0..2c914d3 100644 --- a/simd/x86_64/jccolext-sse2.asm +++ b/simd/x86_64/jccolext-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -95,12 +93,12 @@ EXTN(jsimd_rgb_ycc_convert_sse2): test cl, SIZEOF_BYTE jz short .column_ld2 sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + movzx rax, byte [rsi+rcx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] + movzx rdx, word [rsi+rcx] shl rax, WORD_BIT or rax, rdx .column_ld4: diff --git a/simd/x86_64/jccolor-avx2.asm b/simd/x86_64/jccolor-avx2.asm index f9f4be0..16b7829 100644 --- a/simd/x86_64/jccolor-avx2.asm +++ b/simd/x86_64/jccolor-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jccolor-sse2.asm b/simd/x86_64/jccolor-sse2.asm index 3e46601..e2955c2 100644 --- a/simd/x86_64/jccolor-sse2.asm +++ b/simd/x86_64/jccolor-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jcgray-avx2.asm b/simd/x86_64/jcgray-avx2.asm index 0ec2410..591255b 100644 --- a/simd/x86_64/jcgray-avx2.asm +++ b/simd/x86_64/jcgray-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jcgray-sse2.asm b/simd/x86_64/jcgray-sse2.asm index edf9222..e389904 100644 --- a/simd/x86_64/jcgray-sse2.asm +++ b/simd/x86_64/jcgray-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm index 79e2aa0..175b60d 100644 --- a/simd/x86_64/jcgryext-avx2.asm +++ b/simd/x86_64/jcgryext-avx2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
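Note: the .column_ld1/.column_ld2/.column_ld4 ladders touched in these hunks handle row widths that are not a multiple of the vector width: the remainder is consumed in power-of-two pieces, testing one bit of the residual count at a time, so every load stays within the buffer. A C sketch of the idea (the asm packs the pieces into registers with shl/or, exactly as in the "test cl, SIZEOF_BYTE / sub rcx / movzx" sequence above; this sketch simplifies that to memcpy into a zero-padded staging buffer, and load_tail() is an illustrative name):

#include <stdint.h>
#include <string.h>

/* Consume a tail of `count` bytes (count < 16) largest-offset first,
 * one power-of-two piece per set bit of count. */
static void load_tail(const uint8_t *src, size_t count, uint8_t out[16])
{
  size_t pos = count;

  memset(out, 0, 16);
  if (pos & 1) {               /* SIZEOF_BYTE piece */
    pos -= 1;
    out[pos] = src[pos];
  }
  if (pos & 2) {               /* SIZEOF_WORD piece */
    pos -= 2;
    memcpy(out + pos, src + pos, 2);
  }
  if (pos & 4) {               /* dword piece */
    pos -= 4;
    memcpy(out + pos, src + pos, 4);
  }
  if (pos & 8) {               /* qword piece */
    pos -= 8;
    memcpy(out + pos, src + pos, 8);
  }
}

The matching .column_st* ladders in the decompression hunks below apply the same bit-by-bit decomposition to partial stores.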
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -88,12 +86,12 @@ EXTN(jsimd_rgb_gray_convert_avx2): test cl, SIZEOF_BYTE jz short .column_ld2 sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + movzx rax, byte [rsi+rcx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] + movzx rdx, word [rsi+rcx] shl rax, WORD_BIT or rax, rdx .column_ld4: diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm index 9c3ae5e..873be80 100644 --- a/simd/x86_64/jcgryext-sse2.asm +++ b/simd/x86_64/jcgryext-sse2.asm @@ -12,8 +12,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -87,12 +85,12 @@ EXTN(jsimd_rgb_gray_convert_sse2): test cl, SIZEOF_BYTE jz short .column_ld2 sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + movzx rax, byte [rsi+rcx] .column_ld2: test cl, SIZEOF_WORD jz short .column_ld4 sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] + movzx rdx, word [rsi+rcx] shl rax, WORD_BIT or rax, rdx .column_ld4: diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm index 1b091ad..7deab58 100644 --- a/simd/x86_64/jchuff-sse2.asm +++ b/simd/x86_64/jchuff-sse2.asm @@ -17,8 +17,6 @@ ; This file contains an SSE2 implementation for Huffman coding of one block. ; The following code is based directly on jchuff.c; see jchuff.c for more ; details. -; -; [TAB8] %include "jsimdext.inc" @@ -27,11 +25,10 @@ alignz 32 GLOBAL_DATA(jconst_huff_encode_one_block) + EXTERN EXTN(jpeg_nbits_table) EXTN(jconst_huff_encode_one_block): -%include "jpeg_nbits_table.inc" - alignz 32 ; -------------------------------------------------------------------------- @@ -200,7 +197,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): mov buffer, r11 ; r11 is now sratch mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer; - mov put_bits, DWORD [r10+24] ; put_bits = state->cur.put_bits; + mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits; push r10 ; r10 is now scratch ; Encode the DC coefficient difference per section F.1.2.1 @@ -222,7 +219,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): add ebx, esi ; temp2 += temp3; ; Find the number of bits needed for the magnitude of the coefficient - lea r11, [rel jpeg_nbits_table] + lea r11, [rel EXTN(jpeg_nbits_table)] movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp); ; Emit the Huffman-coded symbol for the number of bits mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits]; @@ -289,7 +286,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): lea rsi, [rsi+r12*2] ; k += r; shr r11, cl ; index >>= r; movzx rdi, word [rsi] ; temp = t1[k]; - lea rbx, [rel jpeg_nbits_table] + lea rbx, [rel EXTN(jpeg_nbits_table)] movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp); .BRLOOP: cmp r12, 16 ; while (r > 15) { @@ -333,7 +330,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): pop r10 ; Save put_buffer & put_bits mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer; - mov DWORD [r10+24], put_bits ; state->cur.put_bits = put_bits; + mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits; pop rbx uncollect_args 6 diff --git a/simd/x86_64/jcphuff-sse2.asm b/simd/x86_64/jcphuff-sse2.asm index a9446b7..8ed4472 100644 --- a/simd/x86_64/jcphuff-sse2.asm +++ b/simd/x86_64/jcphuff-sse2.asm @@ -16,8 +16,6 @@ ; 
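Note: the substantive change in jchuff-sse2.asm replaces a per-object %include of jpeg_nbits_table.inc with EXTERN EXTN(jpeg_nbits_table), so one shared copy of the lookup table is linked instead of each object embedding its own. The table encodes the bit-length function of section F.1.2.1: nbits = JPEG_NBITS(temp) is the number of bits needed for a coefficient magnitude. A sketch of what each entry holds (the library ships the table precomputed; building it at run time here is only for illustration, and build_nbits_table() is not a library function):

#include <stdint.h>

/* JPEG_NBITS(v) = 0 for v == 0, otherwise floor(log2(v)) + 1,
 * i.e. the count of significant bits in v. */
static uint8_t nbits_table[65536];

static void build_nbits_table(void)
{
  nbits_table[0] = 0;
  for (uint32_t v = 1; v < 65536; v++) {
    uint8_t n = 0;
    for (uint32_t t = v; t; t >>= 1)
      n++;                     /* count significant bits of v */
    nbits_table[v] = n;
  }
}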
; This file contains an SSE2 implementation of data preparation for progressive ; Huffman encoding. See jcphuff.c for more details. -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm index 9d5a861..d9922bb 100644 --- a/simd/x86_64/jcsample-avx2.asm +++ b/simd/x86_64/jcsample-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm index 1b31536..0f107e9 100644 --- a/simd/x86_64/jcsample-sse2.asm +++ b/simd/x86_64/jcsample-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm index e2b96c7..677b8ed 100644 --- a/simd/x86_64/jdcolext-avx2.asm +++ b/simd/x86_64/jdcolext-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -334,7 +332,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): vmovd eax, xmmA cmp rcx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [rdi], ax + mov word [rdi], ax add rdi, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD shr rax, 16 @@ -343,7 +341,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): ; space. test rcx, rcx jz short .nextrow - mov BYTE [rdi], al + mov byte [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm index a94954b..071aa62 100644 --- a/simd/x86_64/jdcolext-sse2.asm +++ b/simd/x86_64/jdcolext-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -306,7 +304,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): movd eax, xmmA cmp rcx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [rdi], ax + mov word [rdi], ax add rdi, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD shr rax, 16 @@ -315,7 +313,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): ; space. test rcx, rcx jz short .nextrow - mov BYTE [rdi], al + mov byte [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/x86_64/jdcolor-avx2.asm b/simd/x86_64/jdcolor-avx2.asm index abad176..43de9db 100644 --- a/simd/x86_64/jdcolor-avx2.asm +++ b/simd/x86_64/jdcolor-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdcolor-sse2.asm b/simd/x86_64/jdcolor-sse2.asm index e7079f6..b3f1fec 100644 --- a/simd/x86_64/jdcolor-sse2.asm +++ b/simd/x86_64/jdcolor-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdmerge-avx2.asm b/simd/x86_64/jdmerge-avx2.asm index ca3f063..9515a17 100644 --- a/simd/x86_64/jdmerge-avx2.asm +++ b/simd/x86_64/jdmerge-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdmerge-sse2.asm b/simd/x86_64/jdmerge-sse2.asm index f3e09fa..aedccc2 100644 --- a/simd/x86_64/jdmerge-sse2.asm +++ b/simd/x86_64/jdmerge-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm index 04e8a94..bb733c5 100644 --- a/simd/x86_64/jdmrgext-avx2.asm +++ b/simd/x86_64/jdmrgext-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -339,7 +337,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): vmovd eax, xmmA cmp rcx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [rdi], ax + mov word [rdi], ax add rdi, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD shr rax, 16 @@ -348,7 +346,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): ; space. test rcx, rcx jz short .endcolumn - mov BYTE [rdi], al + mov byte [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm index 1cc3345..b176a4c 100644 --- a/simd/x86_64/jdmrgext-sse2.asm +++ b/simd/x86_64/jdmrgext-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jcolsamp.inc" @@ -310,7 +308,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): movd eax, xmmA cmp rcx, byte SIZEOF_WORD jb short .column_st1 - mov WORD [rdi], ax + mov word [rdi], ax add rdi, byte SIZEOF_WORD sub rcx, byte SIZEOF_WORD shr rax, 16 @@ -319,7 +317,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; space. test rcx, rcx jz short .endcolumn - mov BYTE [rdi], al + mov byte [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm index 10fa5c4..fc274a9 100644 --- a/simd/x86_64/jdsample-avx2.asm +++ b/simd/x86_64/jdsample-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm index d8ccda9..20e0767 100644 --- a/simd/x86_64/jdsample-sse2.asm +++ b/simd/x86_64/jdsample-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" diff --git a/simd/x86_64/jfdctflt-sse.asm b/simd/x86_64/jfdctflt-sse.asm index 26f9fb6..ef27966 100644 --- a/simd/x86_64/jfdctflt-sse.asm +++ b/simd/x86_64/jfdctflt-sse.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the forward DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jfdctfst-sse2.asm b/simd/x86_64/jfdctfst-sse2.asm index aaf8b9e..2e1bfe6 100644 --- a/simd/x86_64/jfdctfst-sse2.asm +++ b/simd/x86_64/jfdctfst-sse2.asm @@ -18,8 +18,6 @@ ; the forward DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c ; for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm index 448f47d..6ad4cf0 100644 --- a/simd/x86_64/jfdctint-avx2.asm +++ b/simd/x86_64/jfdctint-avx2.asm @@ -18,8 +18,6 @@ ; forward DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jfdctint.c; see the jfdctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm index ef16a52..5d0de3c 100644 --- a/simd/x86_64/jfdctint-sse2.asm +++ b/simd/x86_64/jfdctint-sse2.asm @@ -18,8 +18,6 @@ ; forward DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jfdctint.c; see the jfdctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm index b676ef3..ab95e1a 100644 --- a/simd/x86_64/jidctflt-sse2.asm +++ b/simd/x86_64/jidctflt-sse2.asm @@ -17,8 +17,6 @@ ; This file contains a floating-point implementation of the inverse DCT ; (Discrete Cosine Transform). The following code is based directly on ; the IJG's original jidctflt.c; see the jidctflt.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -95,8 +93,8 @@ EXTN(jsimd_idct_float_sse2): mov rcx, DCTSIZE/4 ; ctr .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] jnz near .columnDCT movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm index c6c42f9..a66a681 100644 --- a/simd/x86_64/jidctfst-sse2.asm +++ b/simd/x86_64/jidctfst-sse2.asm @@ -18,8 +18,6 @@ ; the inverse DCT (Discrete Cosine Transform). The following code is ; based directly on the IJG's original jidctfst.c; see the jidctfst.c ; for more details. 
-; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -111,8 +109,8 @@ EXTN(jsimd_idct_ifast_sse2): mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm index b60b44f..50270f4 100644 --- a/simd/x86_64/jidctint-avx2.asm +++ b/simd/x86_64/jidctint-avx2.asm @@ -18,8 +18,6 @@ ; inverse DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jidctint.c; see the jidctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -292,8 +290,8 @@ EXTN(jsimd_idct_islow_avx2): ; ---- Pass 1: process columns. %ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2 - mov eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)] diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm index 83fc344..034530c 100644 --- a/simd/x86_64/jidctint-sse2.asm +++ b/simd/x86_64/jidctint-sse2.asm @@ -18,8 +18,6 @@ ; inverse DCT (Discrete Cosine Transform). The following code is based ; directly on the IJG's original jidctint.c; see the jidctint.c for ; more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -124,8 +122,8 @@ EXTN(jsimd_idct_islow_sse2): mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] jnz near .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm index af64fdc..7fbfcc5 100644 --- a/simd/x86_64/jidctred-sse2.asm +++ b/simd/x86_64/jidctred-sse2.asm @@ -18,8 +18,6 @@ ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. ; The following code is based directly on the IJG's original jidctred.c; ; see the jidctred.c for more details. -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" @@ -132,8 +130,8 @@ EXTN(jsimd_idct_4x4_sse2): mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] jnz short .columnDCT movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] @@ -562,8 +560,8 @@ EXTN(jsimd_idct_2x2_sse2): mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx - mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx + mov word [rdx+rax*SIZEOF_JSAMPLE], bx + mov word [rsi+rax*SIZEOF_JSAMPLE], cx pop rbx uncollect_args 4 diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm index 4600eec..83596a9 100644 --- a/simd/x86_64/jquantf-sse2.asm +++ b/simd/x86_64/jquantf-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). 
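Note: several of the IDCT hunks above touch code guarded by the NO_ZERO_COLUMN_TEST_* macros. That guard implements a common IDCT shortcut: quantized blocks are often sparse, and when every AC coefficient in a column is zero the 1-D column IDCT degenerates to replicating the scaled DC value, so the full butterfly pass can be skipped. The asm ORs the AC rows together in dword-sized chunks and branches to .columnDCT only if the result is nonzero. A C sketch of the test (column_is_dc_only() is an illustrative name):

typedef short JCOEF;
#define DCTSIZE 8

/* Nonzero iff rows 1..7 of the given column are all zero, in which
 * case the column pass reduces to broadcasting the DC term. */
static int column_is_dc_only(const JCOEF *block, int col)
{
  int acc = 0;
  for (int row = 1; row < DCTSIZE; row++)   /* rows 1..7 are the ACs */
    acc |= block[row * DCTSIZE + col];
  return acc == 0;
}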
; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm index b7243e4..5f04d22 100644 --- a/simd/x86_64/jquanti-avx2.asm +++ b/simd/x86_64/jquanti-avx2.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm index 7ff7275..bb6fa69 100644 --- a/simd/x86_64/jquanti-sse2.asm +++ b/simd/x86_64/jquanti-sse2.asm @@ -13,8 +13,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" %include "jdct.inc" diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c index 1e5698b..dc639fc 100644 --- a/simd/x86_64/jsimd.c +++ b/simd/x86_64/jsimd.c @@ -472,6 +472,12 @@ jsimd_can_h2v1_fancy_upsample(void) return 0; } +GLOBAL(int) +jsimd_can_h1v2_fancy_upsample(void) +{ + return 0; +} + GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) @@ -500,6 +506,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, output_data_ptr); } +GLOBAL(void) +jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, + JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) +{ +} + GLOBAL(int) jsimd_can_h2v2_merged_upsample(void) { diff --git a/simd/x86_64/jsimdcpu.asm b/simd/x86_64/jsimdcpu.asm index a905282..705f813 100644 --- a/simd/x86_64/jsimdcpu.asm +++ b/simd/x86_64/jsimdcpu.asm @@ -14,8 +14,6 @@ ; assembler (including Borland's Turbo Assembler). ; NASM is available from http://nasm.sourceforge.net/ or ; http://sourceforge.net/project/showfiles.php?group_id=6208 -; -; [TAB8] %include "jsimdext.inc" |
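Note: the jsimd.c hunk above registers the new h1v2 (1x horizontal, 2x vertical) fancy-upsample entry points that this import adds elsewhere for Arm NEON, but stubs them out on x86-64: the probe returns 0 and the worker is empty, so the decompressor keeps using the portable C path. A sketch of how a caller consumes such a probe, assuming illustrative names (upsample_fn, jsimd_h1v2_fancy_upsample_simd, and h1v2_fancy_upsample_c are not from the source; the real signatures take cinfo/compptr/input/output arguments, elided here for brevity):

typedef void (*upsample_fn)(void);

extern int  jsimd_can_h1v2_fancy_upsample(void);  /* 0 on x86-64 above */
extern void jsimd_h1v2_fancy_upsample_simd(void); /* illustrative */
extern void h1v2_fancy_upsample_c(void);          /* illustrative */

/* Select the SIMD kernel only when the probe approves; otherwise
 * fall back to the portable C upsampler. */
static upsample_fn select_h1v2_upsampler(void)
{
  if (jsimd_can_h1v2_fancy_upsample())
    return jsimd_h1v2_fancy_upsample_simd;  /* e.g. the Arm NEON path */
  return h1v2_fancy_upsample_c;             /* portable fallback */
}

Keeping the probe/worker pair present but inert on every architecture lets the shared dispatch code stay free of per-architecture #ifdefs.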