path: root/simd
author    Elliott Hughes <enh@google.com>  2020-12-05 11:38:16 -0800
committer Elliott Hughes <enh@google.com>  2020-12-07 08:21:18 -0800
commit    6fe8e429f22e958586ca1b56595cf0c48b2f9c08 (patch)
tree      705c48595f7c73fe043c39b4d73b81d4cdca5d4e /simd
parent    98e581f8227b9846b7adc92c0c63f5ed2384ff4b (diff)
parent    bbb828223e9c8f83f0e84db1e98b116029e62765 (diff)
Update to bbb828223e9c8f83f0e84db1e98b116029e62765.

Test: treehugger
Change-Id: I5608ab5b6eb0f2225cd578a711ea0fa3be09f5e8
Diffstat (limited to 'simd')
-rwxr-xr-x  simd/CMakeLists.txt  | 148
-rw-r--r--  simd/arm/aarch32/jccolext-neon.c (renamed from simd/arm/arm/jccolext-neon.c)  | 43
-rw-r--r--  simd/arm/aarch32/jchuff-neon.c  | 334
-rw-r--r--  simd/arm/aarch32/jsimd.c (renamed from simd/arm/arm/jsimd.c)  | 32
-rw-r--r--  simd/arm/aarch64/jccolext-neon.c (renamed from simd/arm/arm64/jccolext-neon.c)  | 146
-rw-r--r--  simd/arm/aarch64/jchuff-neon.c  | 403
-rw-r--r--  simd/arm/aarch64/jsimd.c (renamed from simd/arm/arm64/jsimd.c)  | 115
-rw-r--r--  simd/arm/align.h  | 28
-rw-r--r--  simd/arm/arm/jsimd_neon.S  | 499
-rw-r--r--  simd/arm/arm64/jsimd_neon.S  | 538
-rw-r--r--  simd/arm/common/jdsample-neon.c  | 557
-rw-r--r--  simd/arm/jccolor-neon.c (renamed from simd/arm/common/jccolor-neon.c)  | 70
-rw-r--r--  simd/arm/jcgray-neon.c (renamed from simd/arm/common/jcgray-neon.c)  | 20
-rw-r--r--  simd/arm/jcgryext-neon.c (renamed from simd/arm/common/jcgryext-neon.c)  | 31
-rw-r--r--  simd/arm/jchuff.h  | 149
-rw-r--r--  simd/arm/jcphuff-neon.c  | 591
-rw-r--r--  simd/arm/jcsample-neon.c (renamed from simd/arm/common/jcsample-neon.c)  | 103
-rw-r--r--  simd/arm/jdcolext-neon.c (renamed from simd/arm/common/jdcolext-neon.c)  | 263
-rw-r--r--  simd/arm/jdcolor-neon.c (renamed from simd/arm/common/jdcolor-neon.c)  | 25
-rw-r--r--  simd/arm/jdmerge-neon.c (renamed from simd/arm/common/jdmerge-neon.c)  | 26
-rw-r--r--  simd/arm/jdmrgext-neon.c (renamed from simd/arm/common/jdmrgext-neon.c)  | 504
-rw-r--r--  simd/arm/jdsample-neon.c  | 569
-rw-r--r--  simd/arm/jfdctfst-neon.c (renamed from simd/arm/common/jfdctfst-neon.c)  | 43
-rw-r--r--  simd/arm/jfdctint-neon.c (renamed from simd/arm/common/jfdctint-neon.c)  | 101
-rw-r--r--  simd/arm/jidctfst-neon.c (renamed from simd/arm/common/jidctfst-neon.c)  | 146
-rw-r--r--  simd/arm/jidctint-neon.c (renamed from simd/arm/common/jidctint-neon.c)  | 344
-rw-r--r--  simd/arm/jidctred-neon.c (renamed from simd/arm/common/jidctred-neon.c)  | 191
-rw-r--r--  simd/arm/jquanti-neon.c (renamed from simd/arm/common/jquanti-neon.c)  | 82
-rw-r--r--  simd/arm/neon-compat.h  | 37
-rw-r--r--  simd/arm/neon-compat.h.in  | 35
-rwxr-xr-x  simd/gas-preprocessor.in  | 1
-rw-r--r--  simd/i386/jchuff-sse2.asm  | 1064
-rw-r--r--  simd/i386/jfdctint-avx2.asm  | 6
-rw-r--r--  simd/i386/jfdctint-mmx.asm  | 4
-rw-r--r--  simd/i386/jfdctint-sse2.asm  | 4
-rw-r--r--  simd/i386/jidctint-avx2.asm  | 6
-rw-r--r--  simd/i386/jidctint-mmx.asm  | 4
-rw-r--r--  simd/i386/jidctint-sse2.asm  | 4
-rw-r--r--  simd/i386/jsimd.c  | 12
-rw-r--r--  simd/jsimd.h  | 118
-rw-r--r--  simd/nasm/jsimdcfg.inc.h  | 12
-rw-r--r--  simd/nasm/jsimdext.inc  | 45
-rw-r--r--  simd/x86_64/jccolext-avx2.asm  | 15
-rw-r--r--  simd/x86_64/jccolext-sse2.asm  | 15
-rw-r--r--  simd/x86_64/jcgryext-avx2.asm  | 7
-rw-r--r--  simd/x86_64/jcgryext-sse2.asm  | 7
-rw-r--r--  simd/x86_64/jchuff-sse2.asm  | 807
-rw-r--r--  simd/x86_64/jcsample-avx2.asm  | 15
-rw-r--r--  simd/x86_64/jcsample-sse2.asm  | 15
-rw-r--r--  simd/x86_64/jdcolext-avx2.asm  | 15
-rw-r--r--  simd/x86_64/jdcolext-sse2.asm  | 15
-rw-r--r--  simd/x86_64/jdmrgext-avx2.asm  | 47
-rw-r--r--  simd/x86_64/jdmrgext-sse2.asm  | 47
-rw-r--r--  simd/x86_64/jdsample-avx2.asm  | 33
-rw-r--r--  simd/x86_64/jdsample-sse2.asm  | 33
-rw-r--r--  simd/x86_64/jfdctint-avx2.asm  | 6
-rw-r--r--  simd/x86_64/jfdctint-sse2.asm  | 4
-rw-r--r--  simd/x86_64/jidctflt-sse2.asm  | 9
-rw-r--r--  simd/x86_64/jidctfst-sse2.asm  | 17
-rw-r--r--  simd/x86_64/jidctint-avx2.asm  | 23
-rw-r--r--  simd/x86_64/jidctint-sse2.asm  | 21
-rw-r--r--  simd/x86_64/jidctred-sse2.asm  | 13
-rw-r--r--  simd/x86_64/jquantf-sse2.asm  | 5
-rw-r--r--  simd/x86_64/jquanti-avx2.asm  | 17
-rw-r--r--  simd/x86_64/jquanti-sse2.asm  | 9
-rw-r--r--  simd/x86_64/jsimd.c  | 16
66 files changed, 5143 insertions, 3521 deletions
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 5c8009a..f3c24ef 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -30,6 +30,9 @@ if(CPU_TYPE STREQUAL "x86_64")
if(CYGWIN)
set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
endif()
+ if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+ set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
+ endif()
elseif(CPU_TYPE STREQUAL "i386")
if(BORLAND)
set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
@@ -205,64 +208,76 @@ endif()
###############################################################################
-# ARM (GAS)
+# Arm (Intrinsics or GAS)
###############################################################################
elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
-enable_language(ASM)
-
-set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
-
-string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
-set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
-message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
-
-# Test whether we need gas-preprocessor.pl
-if(CPU_TYPE STREQUAL "arm")
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
- .text
- .fpu neon
- .arch armv7a
- .object_arch armv4
- .arm
- pld [r0]
- vmovn.u16 d0, q0")
+include(CheckSymbolExists)
+if(BITS EQUAL 32)
+ set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
+endif()
+check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
+check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
+check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+ unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
+
+# GCC (as of this writing) and some older versions of Clang do not have a full
+# or optimal set of Neon intrinsics, so for performance reasons, when using
+# those compilers, we default to using the older GAS implementation of the Neon
+# SIMD extensions for certain algorithms. The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this. We always default
+# to using the full Neon intrinsics implementation when building for macOS or
+# iOS, to avoid the need for gas-preprocessor.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
+ set(DEFAULT_NEON_INTRINSICS 1)
else()
- file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
- .text
- MYVAR .req x0
- movi v0.16b, #100
- mov MYVAR, #100
- .unreq MYVAR")
+ set(DEFAULT_NEON_INTRINSICS 0)
endif()
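
[Editorial aside, not part of the patch: a minimal C sketch, under the assumption that check_symbol_exists() simply compiles a probe program, of what the three HAVE_VLD1* checks above effectively verify -- that the compiler's arm_neon.h declares the multi-register load intrinsics (built with -mfpu=neon on 32-bit Arm, as set via CMAKE_REQUIRED_FLAGS).

    #include <arm_neon.h>

    int main(void)
    {
      static const int16_t s16_buf[12] = { 0 };
      static const uint16_t u16_buf[8] = { 0 };
      static const uint8_t u8_buf[64] = { 0 };

      int16x4x3_t a = vld1_s16_x3(s16_buf);    /* probed as HAVE_VLD1_S16_X3 */
      uint16x4x2_t b = vld1_u16_x2(u16_buf);   /* probed as HAVE_VLD1_U16_X2 */
      uint8x16x4_t c = vld1q_u8_x4(u8_buf);    /* probed as HAVE_VLD1Q_U8_X4 */

      return (int)(vget_lane_s16(a.val[0], 0) + vget_lane_u16(b.val[0], 0) +
                   vgetq_lane_u8(c.val[0], 0));
    }

If any of these intrinsics is missing, the corresponding HAVE_* variable stays unset and, per the logic above, the hybrid GAS implementation remains the default.]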
-
-separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}")
-
-execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP}
- -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
- RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
-if(NOT RESULT EQUAL 0)
- message(STATUS "GAS appears to be broken. Trying gas-preprocessor.pl ...")
- execute_process(COMMAND gas-preprocessor.pl ${CMAKE_ASM_COMPILER}
- ${CMAKE_ASM_FLAGS_SEP} -x assembler-with-cpp -c
- ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
- RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
- if(NOT RESULT EQUAL 0)
- simd_fail("SIMD extensions disabled: GAS is not working properly")
- return()
- else()
- message(STATUS "Using gas-preprocessor.pl")
- configure_file(gas-preprocessor.in gas-preprocessor @ONLY)
- set(CMAKE_ASM_COMPILER ${CMAKE_CURRENT_BINARY_DIR}/gas-preprocessor)
- endif()
+option(NEON_INTRINSICS
+ "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms. Setting this option forces the full Neon intrinsics implementation to be used with all compilers. Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+ ${DEFAULT_NEON_INTRINSICS})
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+ add_definitions(-DNEON_INTRINSICS)
+ message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
else()
- message(STATUS "GAS is working properly")
+ message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+endif()
+
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+ arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
+ arm/jidctred-neon.c arm/jquanti-neon.c)
+if(NEON_INTRINSICS)
+ set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 64)
+ set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 32)
+ set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
+ arm/jdcolor-neon.c arm/jfdctint-neon.c)
+endif()
+if(BITS EQUAL 32)
+ set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
endif()
+if(NOT NEON_INTRINSICS)
+ enable_language(ASM)
-file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
+ set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
-add_library(simd OBJECT ${CPU_TYPE}/jsimd_neon.S ${CPU_TYPE}/jsimd.c)
+ string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+ set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+ message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+ set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
+endif()
+
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
@@ -311,14 +326,35 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
endif()
###############################################################################
-# Loongson (Intrinsics)
+# MIPS64 (Intrinsics)
###############################################################################
-elseif(CPU_TYPE STREQUAL "loongson")
+elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "mips64*")
+
+set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
+
+check_c_source_compiles("
+ int main(void) {
+ int c = 0, a = 0, b = 0;
+ asm (
+ \"paddb %0, %1, %2\"
+ : \"=f\" (c)
+ : \"f\" (a), \"f\" (b)
+ );
+ return c;
+ }" HAVE_MMI)
+
+unset(CMAKE_REQUIRED_FLAGS)
-set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
- loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
- loongson/jidctint-mmi.c loongson/jquanti-mmi.c)
+if(NOT HAVE_MMI)
+ simd_fail("SIMD extensions not available for this CPU")
+ return()
+endif()
+
+set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
+ mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
+ mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
+ mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
if(CMAKE_COMPILER_IS_GNUCC)
foreach(file ${SIMD_SOURCES})
@@ -326,8 +362,12 @@ if(CMAKE_COMPILER_IS_GNUCC)
" -fno-strict-aliasing")
endforeach()
endif()
+foreach(file ${SIMD_SOURCES})
+ set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+ " -Wa,-mloongson-mmi,-mloongson-ext")
+endforeach()
-add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
diff --git a/simd/arm/arm/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
index 4f22e1f..362102d 100644
--- a/simd/arm/arm/jccolext-neon.c
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -1,7 +1,8 @@
/*
- * jccolext-neon.c - colorspace conversion (Arm NEON)
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -22,8 +23,8 @@
/* This file is included by jccolor-neon.c */
-/*
- * RGB -> YCbCr conversion is defined by the following equations:
+
+/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
@@ -39,28 +40,29 @@
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
- * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
*/
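
[Editorial aside, not part of the patch: a scalar sketch of the per-pixel fixed-point arithmetic that the vector code below performs. The integer coefficients used here are the equation values above rounded to 16 fractional bits (an assumption spelled out here; the authoritative constants live in jccolor-neon.c).

    #include <stdint.h>

    /* Y = 0.29900*R + 0.58700*G + 0.11400*B, computed as
     * (19595*R + 38470*G + 7471*B + 32768) >> 16; the +32768 mirrors the
     * rounding right shift (vrshrn_n_u32) used for Y below. */
    static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b)
    {
      uint32_t y = 19595 * (uint32_t)r + 38470 * (uint32_t)g + 7471 * (uint32_t)b;
      return (uint8_t)((y + 32768) >> 16);      /* R=G=B=255 -> Y=255 */
    }

    /* Cb = -0.16874*R - 0.33126*G + 0.50000*B + 128; the bias
     * (128 << 16) + 32767 folds in both the +128 offset and the 0.5 that
     * makes plain truncation (vshrn_n_u32) behave like rounding. */
    static uint8_t rgb_to_cb(uint8_t r, uint8_t g, uint8_t b)
    {
      uint32_t cb = (128u << 16) + 32767;
      cb -= 11059 * (uint32_t)r;
      cb -= 21709 * (uint32_t)g;
      cb += 32768 * (uint32_t)b;
      return (uint8_t)(cb >> 16);               /* R=G=B -> Cb=128 */
    }
]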
-void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
- JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row,
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
- /* Pointer to RGB(X/A) input data. */
+ /* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
- /* Pointers to Y, Cb and Cr output data. */
+ /* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
- /* Setup conversion constants. */
-#if defined(__clang__)
+ /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x2(). */
const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
- const uint16x4x2_t consts = { consts1, consts2 };
+ const uint16x4x2_t consts = { { consts1, consts2 } };
#endif
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
@@ -74,11 +76,11 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 8) {
- /* To prevent buffer overread by the vector load instructions, the */
- /* last (image_width % 8) columns of data are first memcopied to a */
- /* temporary buffer large enough to accommodate the vector load. */
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
if (cols_remaining < 8) {
- ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
@@ -129,8 +131,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
vshrn_n_u32(cr_high, 16));
- /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
- /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
diff --git a/simd/arm/aarch32/jchuff-neon.c b/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000..19d94f7
--- /dev/null
+++ b/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint8_t block_nbits[DCTSIZE2];
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load rows of coefficients from DCT block in zig-zag order. */
+
+ /* Compute DC coefficient difference value. (F.1.1.5.1) */
+ int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+ row0 = vld1q_lane_s16(block + 1, row0, 1);
+ row0 = vld1q_lane_s16(block + 8, row0, 2);
+ row0 = vld1q_lane_s16(block + 16, row0, 3);
+ row0 = vld1q_lane_s16(block + 9, row0, 4);
+ row0 = vld1q_lane_s16(block + 2, row0, 5);
+ row0 = vld1q_lane_s16(block + 3, row0, 6);
+ row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+ int16x8_t row1 = vld1q_dup_s16(block + 17);
+ row1 = vld1q_lane_s16(block + 24, row1, 1);
+ row1 = vld1q_lane_s16(block + 32, row1, 2);
+ row1 = vld1q_lane_s16(block + 25, row1, 3);
+ row1 = vld1q_lane_s16(block + 18, row1, 4);
+ row1 = vld1q_lane_s16(block + 11, row1, 5);
+ row1 = vld1q_lane_s16(block + 4, row1, 6);
+ row1 = vld1q_lane_s16(block + 5, row1, 7);
+
+ int16x8_t row2 = vld1q_dup_s16(block + 12);
+ row2 = vld1q_lane_s16(block + 19, row2, 1);
+ row2 = vld1q_lane_s16(block + 26, row2, 2);
+ row2 = vld1q_lane_s16(block + 33, row2, 3);
+ row2 = vld1q_lane_s16(block + 40, row2, 4);
+ row2 = vld1q_lane_s16(block + 48, row2, 5);
+ row2 = vld1q_lane_s16(block + 41, row2, 6);
+ row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+ int16x8_t row3 = vld1q_dup_s16(block + 27);
+ row3 = vld1q_lane_s16(block + 20, row3, 1);
+ row3 = vld1q_lane_s16(block + 13, row3, 2);
+ row3 = vld1q_lane_s16(block + 6, row3, 3);
+ row3 = vld1q_lane_s16(block + 7, row3, 4);
+ row3 = vld1q_lane_s16(block + 14, row3, 5);
+ row3 = vld1q_lane_s16(block + 21, row3, 6);
+ row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+ uint16x8_t row0_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+ vnegq_s16(row0_lz));
+ uint16x8_t row1_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+ vnegq_s16(row1_lz));
+ uint16x8_t row2_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+ vnegq_s16(row2_lz));
+ uint16x8_t row3_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+ vnegq_s16(row3_lz));
+
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+ /* Store diff values for rows 0, 1, 2, and 3. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+ /* Load last four rows of coefficients from DCT block in zig-zag order. */
+ int16x8_t row4 = vld1q_dup_s16(block + 35);
+ row4 = vld1q_lane_s16(block + 42, row4, 1);
+ row4 = vld1q_lane_s16(block + 49, row4, 2);
+ row4 = vld1q_lane_s16(block + 56, row4, 3);
+ row4 = vld1q_lane_s16(block + 57, row4, 4);
+ row4 = vld1q_lane_s16(block + 50, row4, 5);
+ row4 = vld1q_lane_s16(block + 43, row4, 6);
+ row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+ int16x8_t row5 = vld1q_dup_s16(block + 29);
+ row5 = vld1q_lane_s16(block + 22, row5, 1);
+ row5 = vld1q_lane_s16(block + 15, row5, 2);
+ row5 = vld1q_lane_s16(block + 23, row5, 3);
+ row5 = vld1q_lane_s16(block + 30, row5, 4);
+ row5 = vld1q_lane_s16(block + 37, row5, 5);
+ row5 = vld1q_lane_s16(block + 44, row5, 6);
+ row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+ int16x8_t row6 = vld1q_dup_s16(block + 58);
+ row6 = vld1q_lane_s16(block + 59, row6, 1);
+ row6 = vld1q_lane_s16(block + 52, row6, 2);
+ row6 = vld1q_lane_s16(block + 45, row6, 3);
+ row6 = vld1q_lane_s16(block + 38, row6, 4);
+ row6 = vld1q_lane_s16(block + 31, row6, 5);
+ row6 = vld1q_lane_s16(block + 39, row6, 6);
+ row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+ int16x8_t row7 = vld1q_dup_s16(block + 53);
+ row7 = vld1q_lane_s16(block + 60, row7, 1);
+ row7 = vld1q_lane_s16(block + 61, row7, 2);
+ row7 = vld1q_lane_s16(block + 54, row7, 3);
+ row7 = vld1q_lane_s16(block + 47, row7, 4);
+ row7 = vld1q_lane_s16(block + 55, row7, 5);
+ row7 = vld1q_lane_s16(block + 62, row7, 6);
+ row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+ /* Compute number of bits required to represent each coefficient. */
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+ uint16x8_t row4_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+ vnegq_s16(row4_lz));
+ uint16x8_t row5_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+ vnegq_s16(row5_lz));
+ uint16x8_t row6_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+ vnegq_s16(row6_lz));
+ uint16x8_t row7_mask =
+ vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+ vnegq_s16(row7_lz));
+
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+ /* Store diff values for rows 4, 5, 6, and 7. */
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+ uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+ uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+ uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+ uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+ uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+ uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+ uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+ row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+ row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+ row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+ row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+ row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+ row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+ row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+ /* Move bitmap to 32-bit scalar registers. */
+ uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+ uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ unsigned int nbits = block_nbits[0];
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = block_diff[0];
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ while (bitmap_1_32 != 0) {
+ r = BUILTIN_CLZ(bitmap_1_32);
+ i += r;
+ bitmap_1_32 <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap_1_32 <<= 1;
+ }
+
+ r = 33 - i;
+ i = 33;
+
+ while (bitmap_33_63 != 0) {
+ unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+ r += leading_zeros;
+ i += leading_zeros;
+ bitmap_33_63 <<= leading_zeros;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ r = 0;
+ i++;
+ bitmap_33_63 <<= 1;
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
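
[Editorial aside, not part of the patch: a scalar model of the bitmap-driven run-length loop above. Each AC coefficient owns one bit of the bitmap, most significant first, so counting leading zeros yields the number of zero coefficients to skip before the next non-zero one. BUILTIN_CLZ is modelled with GCC/Clang's __builtin_clz, and emit() is a hypothetical stand-in for the PUT_BITS()/PUT_CODE() macros.

    #include <stdint.h>

    static void walk_ac_bitmap(uint32_t bitmap_1_32,
                               void (*emit)(unsigned int run, unsigned int coef))
    {
      unsigned int i = 1;                 /* index of the next AC coefficient */
      while (bitmap_1_32 != 0) {
        unsigned int r = (unsigned int)__builtin_clz(bitmap_1_32); /* zero run */
        i += r;
        bitmap_1_32 <<= r;                /* bit for coefficient i is now MSB */
        emit(r, i);                       /* Huffman-code (run length, coeff i) */
        i++;
        bitmap_1_32 <<= 1;                /* consume the bit just encoded */
      }
    }
]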
diff --git a/simd/arm/arm/jsimd.c b/simd/arm/aarch32/jsimd.c
index c0d5d90..fac55df 100644
--- a/simd/arm/arm/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -6,6 +6,7 @@
* Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
* Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,7 +14,7 @@
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
- * 32-bit ARM architecture.
+ * 32-bit Arm architecture.
*/
#define JPEG_INTERNALS
@@ -118,7 +119,7 @@ init_simd(void)
#if defined(__ARM_NEON__)
simd_support |= JSIMD_NEON;
#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
- /* We still have a chance to use NEON regardless of globally used
+ /* We still have a chance to use Neon regardless of globally used
* -mcpu/-mfpu options passed to gcc by performing runtime detection via
* /proc/cpuinfo parsing on linux/android */
while (!parse_proc_cpuinfo(bufsize)) {
@@ -422,7 +423,6 @@ jsimd_can_h2v1_upsample(void)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
-
if (simd_support & JSIMD_NEON)
return 1;
@@ -934,6 +934,16 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -942,11 +952,23 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -955,5 +977,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
- return 0;
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start, Sl,
+ Al, absvalues, bits);
}
diff --git a/simd/arm/arm64/jccolext-neon.c b/simd/arm/aarch64/jccolext-neon.c
index 89f520a..37130c2 100644
--- a/simd/arm/arm64/jccolext-neon.c
+++ b/simd/arm/aarch64/jccolext-neon.c
@@ -1,7 +1,7 @@
/*
- * jccolext-neon.c - colorspace conversion (Arm NEON)
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -22,8 +22,8 @@
/* This file is included by jccolor-neon.c */
-/*
- * RGB -> YCbCr conversion is defined by the following equations:
+
+/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
@@ -39,21 +39,22 @@
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
- * To ensure rounding gives correct values, we add 0.5 to Cb and Cr.
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
*/
-void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
- JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row,
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
- /* Pointer to RGB(X/A) input data. */
+ /* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
- /* Pointers to Y, Cb and Cr output data. */
+ /* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
- /* Setup conversion constants. */
+ /* Set up conversion constants. */
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
@@ -83,15 +84,15 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
- uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
- y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
- y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
- uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
- y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
- y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
@@ -99,17 +100,17 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
- cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
- cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
- cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
- cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
- cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
- cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
@@ -117,17 +118,17 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
- cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
- cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
- cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
- cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
- cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
- cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
@@ -144,8 +145,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
- /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
- /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
@@ -158,10 +160,10 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
}
if (cols_remaining > 8) {
- /* To prevent buffer overread by the vector load instructions, the */
- /* last (image_width % 16) columns of data are first memcopied to a */
- /* temporary buffer large enough to accommodate the vector load. */
- ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
@@ -181,15 +183,15 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
- uint32x4_t y_lh = vmull_high_laneq_u16(r_l, consts, 0);
- y_lh = vmlal_high_laneq_u16(y_lh, g_l, consts, 1);
- y_lh = vmlal_high_laneq_u16(y_lh, b_l, consts, 2);
+ uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+ y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
- uint32x4_t y_hh = vmull_high_laneq_u16(r_h, consts, 0);
- y_hh = vmlal_high_laneq_u16(y_hh, g_h, consts, 1);
- y_hh = vmlal_high_laneq_u16(y_hh, b_h, consts, 2);
+ uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+ y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
@@ -197,17 +199,17 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
- cb_lh = vmlsl_high_laneq_u16(cb_lh, r_l, consts, 3);
- cb_lh = vmlsl_high_laneq_u16(cb_lh, g_l, consts, 4);
- cb_lh = vmlal_high_laneq_u16(cb_lh, b_l, consts, 5);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+ cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+ cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
- cb_hh = vmlsl_high_laneq_u16(cb_hh, r_h, consts, 3);
- cb_hh = vmlsl_high_laneq_u16(cb_hh, g_h, consts, 4);
- cb_hh = vmlal_high_laneq_u16(cb_hh, b_h, consts, 5);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+ cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+ cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
@@ -215,17 +217,17 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
- cr_lh = vmlal_high_laneq_u16(cr_lh, r_l, consts, 5);
- cr_lh = vmlsl_high_laneq_u16(cr_lh, g_l, consts, 6);
- cr_lh = vmlsl_high_laneq_u16(cr_lh, b_l, consts, 7);
+ cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+ cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
- cr_hh = vmlal_high_laneq_u16(cr_hh, r_h, consts, 5);
- cr_hh = vmlsl_high_laneq_u16(cr_hh, g_h, consts, 6);
- cr_hh = vmlsl_high_laneq_u16(cr_hh, b_h, consts, 7);
+ cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+ cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
@@ -242,17 +244,18 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
- /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
- /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
} else if (cols_remaining > 0) {
- /* To prevent buffer overread by the vector load instructions, the */
- /* last (image_width % 8) columns of data are first memcopied to a */
- /* temporary buffer large enough to accommodate the vector load. */
- ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 8) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
@@ -269,9 +272,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
- uint32x4_t y_h = vmull_high_laneq_u16(r, consts, 0);
- y_h = vmlal_high_laneq_u16(y_h, g, consts, 1);
- y_h = vmlal_high_laneq_u16(y_h, b, consts, 2);
+ uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+ y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_l = scaled_128_5;
@@ -279,9 +282,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
uint32x4_t cb_h = scaled_128_5;
- cb_h = vmlsl_high_laneq_u16(cb_h, r, consts, 3);
- cb_h = vmlsl_high_laneq_u16(cb_h, g, consts, 4);
- cb_h = vmlal_high_laneq_u16(cb_h, b, consts, 5);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+ cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+ cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_l = scaled_128_5;
@@ -289,9 +292,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
uint32x4_t cr_h = scaled_128_5;
- cr_h = vmlal_high_laneq_u16(cr_h, r, consts, 5);
- cr_h = vmlsl_high_laneq_u16(cr_h, g, consts, 6);
- cr_h = vmlsl_high_laneq_u16(cr_h, b, consts, 7);
+ cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+ cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
@@ -302,8 +305,9 @@ void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width,
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
vshrn_n_u32(cr_h, 16));
- /* Narrow Y, Cb and Cr values to 8-bit and store to memory. Buffer */
- /* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes. */
+ /* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
+ * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000..a0a57a6
--- /dev/null
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,403 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+ 0, 1, 2, 3, 16, 17, 32, 33,
+ 18, 19, 4, 5, 6, 7, 20, 21,
+ 34, 35, 48, 49, 255, 255, 50, 51,
+ 36, 37, 22, 23, 8, 9, 10, 11,
+ 255, 255, 6, 7, 20, 21, 34, 35,
+ 48, 49, 255, 255, 50, 51, 36, 37,
+ 54, 55, 40, 41, 26, 27, 12, 13,
+ 14, 15, 28, 29, 42, 43, 56, 57,
+ 6, 7, 20, 21, 34, 35, 48, 49,
+ 50, 51, 36, 37, 22, 23, 8, 9,
+ 26, 27, 12, 13, 255, 255, 14, 15,
+ 28, 29, 42, 43, 56, 57, 255, 255,
+ 52, 53, 54, 55, 40, 41, 26, 27,
+ 12, 13, 255, 255, 14, 15, 28, 29,
+ 26, 27, 40, 41, 42, 43, 28, 29,
+ 14, 15, 30, 31, 44, 45, 46, 47
+};
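
[Editorial aside, not part of the patch: each byte in the table above is an index for vqtbl4q_s8(), which gathers bytes from the 64 bytes of its four concatenated source registers, so byte index 2*n selects 16-bit coefficient n; 255 is out of range and yields zero, and those lanes are patched afterwards with vsetq_lane_s16(). A quick decode of the first row:

    #include <stdio.h>

    int main(void)
    {
      /* First 16 bytes of the index table above */
      const unsigned char idx_row0[16] = { 0, 1, 2, 3, 16, 17, 32, 33,
                                           18, 19, 4, 5, 6, 7, 20, 21 };
      for (int lane = 0; lane < 8; lane++)
        printf("lane %d -> coefficient %d\n", lane, idx_row0[2 * lane] / 2);
      /* Prints coefficients 0, 1, 8, 16, 9, 2, 3, 10: the first eight entries
       * of the JPEG zig-zag order. */
      return 0;
    }
]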
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ uint16_t block_diff[DCTSIZE2];
+
+ /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const uint8x16x4_t idx_rows_0123 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+ const uint8x16x4_t idx_rows_4567 =
+ vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+ /* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
+ const uint8x16x4_t idx_rows_0123 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+ } };
+ const uint8x16x4_t idx_rows_4567 = { {
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+ vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+ } };
+#endif
+
+ /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+ const int8x16x4_t tbl_rows_0123 =
+ vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+ const int8x16x4_t tbl_rows_4567 =
+ vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+ const int8x16x4_t tbl_rows_0123 = { {
+ vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+ } };
+ const int8x16x4_t tbl_rows_4567 = { {
+ vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+ vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+ } };
+#endif
+
+ /* Initialise extra lookup tables. */
+ const int8x16x4_t tbl_rows_2345 = { {
+ tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+ tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+ } };
+ const int8x16x3_t tbl_rows_567 =
+ { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+ /* Shuffle coefficients into zig-zag order. */
+ int16x8_t row0 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+ int16x8_t row1 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+ int16x8_t row2 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+ int16x8_t row3 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+ int16x8_t row4 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+ int16x8_t row5 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+ int16x8_t row6 =
+ vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+ int16x8_t row7 =
+ vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+ /* Compute DC coefficient difference value (F.1.1.5.1). */
+ row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+ /* Initialize AC coefficient lanes not reachable by lookup tables. */
+ row1 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+ 0), row1, 2);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 4), row2, 0);
+ row2 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 0), row2, 5);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+ 7), row5, 2);
+ row5 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+ 3), row5, 7);
+ row6 =
+ vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+ 7), row6, 5);
+
+ /* DCT block is now in zig-zag order; start Huffman encoding process. */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
+
+ /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+ uint16x8_t row0_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15)));
+ uint16x8_t row1_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15)));
+ uint16x8_t row2_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15)));
+ uint16x8_t row3_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15)));
+ uint16x8_t row4_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15)));
+ uint16x8_t row5_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15)));
+ uint16x8_t row6_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15)));
+ uint16x8_t row7_diff =
+ vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15)));
+
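
[Editorial worked example, not part of the patch: for a negative coefficient the JPEG additional bits are (coeff - 1) truncated to nbits bits, and because ~abs(x) == -abs(x) - 1 in two's complement, XORing abs(coeff) with the sign mask produced by vshrq_n_s16(row, 15) yields exactly those bits.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
      int16_t coeff = -3;                     /* example coefficient */
      uint16_t abs_v = 3, sign_mask = 0xFFFF; /* one lane of vshrq_n_s16(row, 15) */
      unsigned int nbits = 2;                 /* bits needed for |coeff| */
      unsigned int low = (1u << nbits) - 1;
      assert(((abs_v ^ sign_mask) & low) == ((unsigned int)(coeff - 1) & low));
      return 0;
    }
]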
+ /* Construct bitmap to accelerate encoding of AC coefficients. A set bit
+ * means that the corresponding coefficient != 0.
+ */
+ uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6),
+ vdupq_n_u16(0)));
+ uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
+ vdupq_n_u16(0)));
+
+ /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+ abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
+ abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
+ abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask);
+ abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask);
+ abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask);
+ abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask);
+ abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask);
+ abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0);
+ uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0);
+ uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0);
+ uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0);
+ uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+ /* Shift left to remove DC bit. */
+ bitmap_all =
+ vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+ /* Count bits set (number of non-zero coefficients) in bitmap. */
+ unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+ /* Move bitmap to 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Set up state and bit buffer for output bitstream. */
+ working_state *state_ptr = (working_state *)state;
+ int free_bits = state_ptr->cur.free_bits;
+ size_t put_buffer = state_ptr->cur.put_buffer;
+
+ /* Encode DC coefficient. */
+
+ /* Find nbits required to specify sign and amplitude of coefficient. */
+#if defined(_MSC_VER) && !defined(__clang__)
+ unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
+#else
+ unsigned int lz;
+ __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+#endif
+ unsigned int nbits = 32 - lz;
+ /* Emit Huffman-coded symbol and additional diff bits. */
+ unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
+ PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
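
[Editorial worked example, not part of the patch: for a DC difference of +5, abs = 5 has 29 leading zeros as a 32-bit value, so nbits = 32 - 29 = 3, and the (diff << lz) >> lz step keeps only the 3 additional bits that accompany the Huffman symbol. __builtin_clz stands in for the clz path used above.

    #include <assert.h>

    int main(void)
    {
      unsigned int abs_dc = 5;                              /* |DC difference| */
      unsigned int lz = (unsigned int)__builtin_clz(abs_dc);            /* 29 */
      unsigned int nbits = 32 - lz;                                     /*  3 */
      unsigned int diff = 5;                   /* positive coeff: bits = value */
      assert(nbits == 3 && ((diff << lz) >> lz) == 5);
      return 0;
    }
]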
+ /* Encode AC coefficients. */
+
+ unsigned int r = 0; /* r = run length of zeros */
+ unsigned int i = 1; /* i = number of coefficients encoded */
+ /* Code and size information for a run length of 16 zero coefficients */
+ const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+ const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+ /* The most efficient method of computing nbits and diff depends on the
+ * number of non-zero coefficients. If the bitmap is not too sparse (> 8
+ * non-zero AC coefficients), it is beneficial to use Neon; else we compute
+ * nbits and diff on demand using scalar code.
+ */
+ if (non_zero_coefficients > 8) {
+ uint8_t block_nbits[DCTSIZE2];
+
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t row1_lz = vclzq_s16(abs_row1);
+ int16x8_t row2_lz = vclzq_s16(abs_row2);
+ int16x8_t row3_lz = vclzq_s16(abs_row3);
+ int16x8_t row4_lz = vclzq_s16(abs_row4);
+ int16x8_t row5_lz = vclzq_s16(abs_row5);
+ int16x8_t row6_lz = vclzq_s16(abs_row6);
+ int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Compute nbits needed to specify magnitude of each coefficient. */
+ uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+ uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+ uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+ uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+ uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+ uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+ uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+ uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+ vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+ /* Store nbits. */
+ vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+ vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+ vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+ vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+ vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+ vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+ vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+ vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+ /* Mask bits not required to specify sign and amplitude of diff. */
+ row0_diff = vshlq_u16(row0_diff, row0_lz);
+ row1_diff = vshlq_u16(row1_diff, row1_lz);
+ row2_diff = vshlq_u16(row2_diff, row2_lz);
+ row3_diff = vshlq_u16(row3_diff, row3_lz);
+ row4_diff = vshlq_u16(row4_diff, row4_lz);
+ row5_diff = vshlq_u16(row5_diff, row5_lz);
+ row6_diff = vshlq_u16(row6_diff, row6_lz);
+ row7_diff = vshlq_u16(row7_diff, row7_lz);
+ row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz));
+ row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz));
+ row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz));
+ row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz));
+ row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz));
+ row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz));
+ row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz));
+ row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz));
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ while (bitmap != 0) {
+ r = BUILTIN_CLZL(bitmap);
+ i += r;
+ bitmap <<= r;
+ nbits = block_nbits[i];
+ diff = block_diff[i];
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ } else if (bitmap != 0) {
+ uint16_t block_abs[DCTSIZE2];
+ /* Store absolute value of coefficients. */
+ vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+ vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+ vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+ vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+ vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+ vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+ vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+ vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+ /* Store diff bits. */
+ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+ vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+ vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+ vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+ vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+ vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+ vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+ vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+ /* Same as above but must mask diff bits and compute nbits on demand. */
+ while (bitmap != 0) {
+ r = BUILTIN_CLZL(bitmap);
+ i += r;
+ bitmap <<= r;
+ lz = BUILTIN_CLZ(block_abs[i]);
+ nbits = 32 - lz;
+ diff = (unsigned int)(block_diff[i] << lz) >> lz;
+ while (r > 15) {
+ /* If run length > 15, emit special run-length-16 codes. */
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+ unsigned int rs = (r << 4) + nbits;
+ PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+ i++;
+ bitmap <<= 1;
+ }
+ }
+
+ /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+ * The value of RS for the EOB code is 0.
+ */
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+
+ return buffer;
+}
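
For reference, the bitmap-driven run-length scan used by both loops above can be expressed as a minimal scalar C sketch. None of this code comes from the patch: emit_symbol() is a hypothetical stand-in for the PUT_BITS/PUT_CODE macros, and __builtin_clzll() stands in for BUILTIN_CLZL.

#include <stdint.h>

/* One bit per AC coefficient, MSB first, with the DC bit already shifted out
 * (as in the code above).  Count-leading-zeros jumps straight to the next
 * non-zero coefficient, so runs of zeros cost no per-coefficient work. */
static void scan_ac_bitmap(uint64_t bitmap, const uint8_t nbits[64],
                           void (*emit_symbol)(unsigned int run,
                                               unsigned int size))
{
  unsigned int i = 1;                          /* index of next coefficient */
  while (bitmap != 0) {
    unsigned int r = (unsigned int)__builtin_clzll(bitmap);  /* run of zeros */
    i += r;
    bitmap <<= r;
    while (r > 15) {                  /* JPEG codes at most 15 zeros per symbol */
      emit_symbol(15, 0);             /* special run-length-16 symbol (0xF0) */
      r -= 16;
    }
    emit_symbol(r, nbits[i]);         /* run/size symbol; diff bits follow */
    i++;
    bitmap <<= 1;                     /* consume the coefficient just coded */
  }
}
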
diff --git a/simd/arm/arm64/jsimd.c b/simd/arm/aarch64/jsimd.c
index ca29cd6..8570b82 100644
--- a/simd/arm/arm64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -3,8 +3,9 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -12,7 +13,7 @@
*
* This file contains the interface between the "normal" portions
* of the library and the SIMD implementations when running on a
- * 64-bit ARM architecture.
+ * 64-bit Arm architecture.
*/
#define JPEG_INTERNALS
@@ -22,16 +23,20 @@
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
+#include "jconfigint.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
#define JSIMD_FASTTBL 4
static unsigned int simd_support = ~0;
static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTTBL;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+ JSIMD_FASTTBL;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
@@ -111,8 +116,8 @@ parse_proc_cpuinfo(int bufsize)
*/
/*
- * ARMv8 architectures support NEON extensions by default.
- * It is no longer optional as it was with ARMv7.
+ * Armv8 architectures support Neon extensions by default.
+ * It is no longer optional as it was with Armv7.
*/
@@ -151,6 +156,16 @@ init_simd(void)
env = getenv("JSIMD_NOHUFFENC");
if ((env != NULL) && (strcmp(env, "1") == 0))
simd_huffman = 0;
+ env = getenv("JSIMD_FASTLD3");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_features |= JSIMD_FASTLD3;
+ if ((env != NULL) && (strcmp(env, "0") == 0))
+ simd_features &= ~JSIMD_FASTLD3;
+ env = getenv("JSIMD_FASTST3");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_features |= JSIMD_FASTST3;
+ if ((env != NULL) && (strcmp(env, "0") == 0))
+ simd_features &= ~JSIMD_FASTST3;
#endif
}
@@ -237,14 +252,28 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
- neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_extrgbx_ycc_convert_neon;
break;
case JCS_EXT_BGR:
- neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
@@ -259,7 +288,14 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
neonfct = jsimd_extxrgb_ycc_convert_neon;
break;
default:
- neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTLD3)
+#endif
+ neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
break;
}
@@ -313,14 +349,28 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
- neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_ycc_extrgbx_convert_neon;
break;
case JCS_EXT_BGR:
- neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
@@ -335,7 +385,15 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
neonfct = jsimd_ycc_extxrgb_convert_neon;
break;
default:
- neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ if (simd_features & JSIMD_FASTST3)
+#endif
+ neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+ else
+ neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+ break;
}
neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
@@ -433,7 +491,6 @@ jsimd_can_h2v1_upsample(void)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;
-
if (simd_support & JSIMD_NEON)
return 1;
@@ -938,17 +995,33 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
int last_dc_val, c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
+#ifndef NEON_INTRINSICS
if (simd_features & JSIMD_FASTTBL)
+#endif
return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
dctbl, actbl);
+#ifndef NEON_INTRINSICS
else
return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
last_dc_val, dctbl, actbl);
+#endif
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_first_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -957,11 +1030,25 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *values, size_t *zerobits)
{
+ jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+ Sl, Al, values, zerobits);
}
GLOBAL(int)
jsimd_can_encode_mcu_AC_refine_prepare(void)
{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (SIZEOF_SIZE_T != 8)
+ return 0;
+
+ if (simd_support & JSIMD_NEON)
+ return 1;
+
return 0;
}
@@ -970,5 +1057,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
int Al, JCOEF *absvalues, size_t *bits)
{
- return 0;
+ return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+ jpeg_natural_order_start,
+ Sl, Al, absvalues, bits);
}
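
The JSIMD_FASTLD3/JSIMD_FASTST3 handling added to init_simd() follows the same override pattern as the existing JSIMD_NOHUFFENC check: "1" forces a feature bit on, "0" forces it off, and anything else leaves the detected default alone. A minimal sketch of that pattern in isolation (the helper name is illustrative; the flag values mirror the JSIMD_FASTLD3/JSIMD_FASTST3 definitions added above):

#include <stdlib.h>
#include <string.h>

static unsigned int apply_env_override(unsigned int features, const char *name,
                                       unsigned int flag)
{
  const char *env = getenv(name);
  if (env != NULL && strcmp(env, "1") == 0)
    features |= flag;                 /* explicit opt-in */
  if (env != NULL && strcmp(env, "0") == 0)
    features &= ~flag;                /* explicit opt-out */
  return features;
}

/* e.g. simd_features = apply_env_override(simd_features, "JSIMD_FASTLD3", 1);
 *      simd_features = apply_env_override(simd_features, "JSIMD_FASTST3", 2); */
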
diff --git a/simd/arm/align.h b/simd/arm/align.h
new file mode 100644
index 0000000..cff4241
--- /dev/null
+++ b/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment) __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
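
A minimal usage sketch for the macro above (the array name and values are illustrative, not taken from the patch): ALIGN(n) is placed at the start of the declaration, in the style the Neon intrinsics files use for their constant tables, so that vector loads and stores can rely on n-byte alignment under MSVC, Clang, and GCC alike.

#include <stdint.h>
#include "align.h"   /* the header added above */

ALIGN(16) static const int16_t example_consts[8] = {
  1, 2, 4, 8, 16, 32, 64, 128        /* illustrative values only */
};
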
diff --git a/simd/arm/arm/jsimd_neon.S b/simd/arm/arm/jsimd_neon.S
deleted file mode 100644
index 2c45324..0000000
--- a/simd/arm/arm/jsimd_neon.S
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * ARMv7 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .private_extern _\fname
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- sub \PUT_BITS, \PUT_BITS, #0x8
- lsr \TMP, \PUT_BUFFER, \PUT_BITS
- uxtb \TMP, \TMP
- strb \TMP, [\BUFFER, #1]!
- cmp \TMP, #0xff
- /*it eq*/
- strbeq \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
- /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
- add \PUT_BITS, \SIZE
- /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
- orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
- cmp \PUT_BITS, #0x10
- blt 15f
- eor \ZERO, \ZERO, \ZERO
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
- emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
- .byte 0x01
- .byte 0x02
- .byte 0x04
- .byte 0x08
- .byte 0x10
- .byte 0x20
- .byte 0x40
- .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
- push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
- add r7, sp, #0x1c
- sub r4, sp, #0x40
- bfc r4, #0, #5
- mov sp, r4 /* align sp on 32 bytes */
- vst1.64 {d8, d9, d10, d11}, [r4, :128]!
- vst1.64 {d12, d13, d14, d15}, [r4, :128]
- sub sp, #0x140 /* reserve 320 bytes */
- str r0, [sp, #0x18] /* working state > sp + 0x18 */
- add r4, sp, #0x20 /* r4 = t1 */
- ldr lr, [r7, #0x8] /* lr = dctbl */
- sub r10, r1, #0x1 /* r10=buffer-- */
- ldrsh r1, [r2]
- mov r9, #0x10
- mov r8, #0x1
- adr r5, jsimd_huff_encode_one_block_neon_consts
- /* prepare data */
- vld1.8 {d26}, [r5, :64]
- veor q8, q8, q8
- veor q9, q9, q9
- vdup.16 q14, r9
- vdup.16 q15, r8
- veor q10, q10, q10
- veor q11, q11, q11
- sub r1, r1, r3
- add r9, r2, #0x22
- add r8, r2, #0x18
- add r3, r2, #0x36
- vmov.16 d0[0], r1
- vld1.16 {d2[0]}, [r9, :16]
- vld1.16 {d4[0]}, [r8, :16]
- vld1.16 {d6[0]}, [r3, :16]
- add r1, r2, #0x2
- add r9, r2, #0x30
- add r8, r2, #0x26
- add r3, r2, #0x28
- vld1.16 {d0[1]}, [r1, :16]
- vld1.16 {d2[1]}, [r9, :16]
- vld1.16 {d4[1]}, [r8, :16]
- vld1.16 {d6[1]}, [r3, :16]
- add r1, r2, #0x10
- add r9, r2, #0x40
- add r8, r2, #0x34
- add r3, r2, #0x1a
- vld1.16 {d0[2]}, [r1, :16]
- vld1.16 {d2[2]}, [r9, :16]
- vld1.16 {d4[2]}, [r8, :16]
- vld1.16 {d6[2]}, [r3, :16]
- add r1, r2, #0x20
- add r9, r2, #0x32
- add r8, r2, #0x42
- add r3, r2, #0xc
- vld1.16 {d0[3]}, [r1, :16]
- vld1.16 {d2[3]}, [r9, :16]
- vld1.16 {d4[3]}, [r8, :16]
- vld1.16 {d6[3]}, [r3, :16]
- add r1, r2, #0x12
- add r9, r2, #0x24
- add r8, r2, #0x50
- add r3, r2, #0xe
- vld1.16 {d1[0]}, [r1, :16]
- vld1.16 {d3[0]}, [r9, :16]
- vld1.16 {d5[0]}, [r8, :16]
- vld1.16 {d7[0]}, [r3, :16]
- add r1, r2, #0x4
- add r9, r2, #0x16
- add r8, r2, #0x60
- add r3, r2, #0x1c
- vld1.16 {d1[1]}, [r1, :16]
- vld1.16 {d3[1]}, [r9, :16]
- vld1.16 {d5[1]}, [r8, :16]
- vld1.16 {d7[1]}, [r3, :16]
- add r1, r2, #0x6
- add r9, r2, #0x8
- add r8, r2, #0x52
- add r3, r2, #0x2a
- vld1.16 {d1[2]}, [r1, :16]
- vld1.16 {d3[2]}, [r9, :16]
- vld1.16 {d5[2]}, [r8, :16]
- vld1.16 {d7[2]}, [r3, :16]
- add r1, r2, #0x14
- add r9, r2, #0xa
- add r8, r2, #0x44
- add r3, r2, #0x38
- vld1.16 {d1[3]}, [r1, :16]
- vld1.16 {d3[3]}, [r9, :16]
- vld1.16 {d5[3]}, [r8, :16]
- vld1.16 {d7[3]}, [r3, :16]
- vcgt.s16 q8, q8, q0
- vcgt.s16 q9, q9, q1
- vcgt.s16 q10, q10, q2
- vcgt.s16 q11, q11, q3
- vabs.s16 q0, q0
- vabs.s16 q1, q1
- vabs.s16 q2, q2
- vabs.s16 q3, q3
- veor q8, q8, q0
- veor q9, q9, q1
- veor q10, q10, q2
- veor q11, q11, q3
- add r9, r4, #0x20
- add r8, r4, #0x80
- add r3, r4, #0xa0
- vclz.i16 q0, q0
- vclz.i16 q1, q1
- vclz.i16 q2, q2
- vclz.i16 q3, q3
- vsub.i16 q0, q14, q0
- vsub.i16 q1, q14, q1
- vsub.i16 q2, q14, q2
- vsub.i16 q3, q14, q3
- vst1.16 {d0, d1, d2, d3}, [r4, :256]
- vst1.16 {d4, d5, d6, d7}, [r9, :256]
- vshl.s16 q0, q15, q0
- vshl.s16 q1, q15, q1
- vshl.s16 q2, q15, q2
- vshl.s16 q3, q15, q3
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q15
- vsub.i16 q2, q2, q15
- vsub.i16 q3, q3, q15
- vand q8, q8, q0
- vand q9, q9, q1
- vand q10, q10, q2
- vand q11, q11, q3
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- add r1, r2, #0x46
- add r9, r2, #0x3a
- add r8, r2, #0x74
- add r3, r2, #0x6a
- vld1.16 {d8[0]}, [r1, :16]
- vld1.16 {d10[0]}, [r9, :16]
- vld1.16 {d12[0]}, [r8, :16]
- vld1.16 {d14[0]}, [r3, :16]
- veor q8, q8, q8
- veor q9, q9, q9
- veor q10, q10, q10
- veor q11, q11, q11
- add r1, r2, #0x54
- add r9, r2, #0x2c
- add r8, r2, #0x76
- add r3, r2, #0x78
- vld1.16 {d8[1]}, [r1, :16]
- vld1.16 {d10[1]}, [r9, :16]
- vld1.16 {d12[1]}, [r8, :16]
- vld1.16 {d14[1]}, [r3, :16]
- add r1, r2, #0x62
- add r9, r2, #0x1e
- add r8, r2, #0x68
- add r3, r2, #0x7a
- vld1.16 {d8[2]}, [r1, :16]
- vld1.16 {d10[2]}, [r9, :16]
- vld1.16 {d12[2]}, [r8, :16]
- vld1.16 {d14[2]}, [r3, :16]
- add r1, r2, #0x70
- add r9, r2, #0x2e
- add r8, r2, #0x5a
- add r3, r2, #0x6c
- vld1.16 {d8[3]}, [r1, :16]
- vld1.16 {d10[3]}, [r9, :16]
- vld1.16 {d12[3]}, [r8, :16]
- vld1.16 {d14[3]}, [r3, :16]
- add r1, r2, #0x72
- add r9, r2, #0x3c
- add r8, r2, #0x4c
- add r3, r2, #0x5e
- vld1.16 {d9[0]}, [r1, :16]
- vld1.16 {d11[0]}, [r9, :16]
- vld1.16 {d13[0]}, [r8, :16]
- vld1.16 {d15[0]}, [r3, :16]
- add r1, r2, #0x64
- add r9, r2, #0x4a
- add r8, r2, #0x3e
- add r3, r2, #0x6e
- vld1.16 {d9[1]}, [r1, :16]
- vld1.16 {d11[1]}, [r9, :16]
- vld1.16 {d13[1]}, [r8, :16]
- vld1.16 {d15[1]}, [r3, :16]
- add r1, r2, #0x56
- add r9, r2, #0x58
- add r8, r2, #0x4e
- add r3, r2, #0x7c
- vld1.16 {d9[2]}, [r1, :16]
- vld1.16 {d11[2]}, [r9, :16]
- vld1.16 {d13[2]}, [r8, :16]
- vld1.16 {d15[2]}, [r3, :16]
- add r1, r2, #0x48
- add r9, r2, #0x66
- add r8, r2, #0x5c
- add r3, r2, #0x7e
- vld1.16 {d9[3]}, [r1, :16]
- vld1.16 {d11[3]}, [r9, :16]
- vld1.16 {d13[3]}, [r8, :16]
- vld1.16 {d15[3]}, [r3, :16]
- vcgt.s16 q8, q8, q4
- vcgt.s16 q9, q9, q5
- vcgt.s16 q10, q10, q6
- vcgt.s16 q11, q11, q7
- vabs.s16 q4, q4
- vabs.s16 q5, q5
- vabs.s16 q6, q6
- vabs.s16 q7, q7
- veor q8, q8, q4
- veor q9, q9, q5
- veor q10, q10, q6
- veor q11, q11, q7
- add r1, r4, #0x40
- add r9, r4, #0x60
- add r8, r4, #0xc0
- add r3, r4, #0xe0
- vclz.i16 q4, q4
- vclz.i16 q5, q5
- vclz.i16 q6, q6
- vclz.i16 q7, q7
- vsub.i16 q4, q14, q4
- vsub.i16 q5, q14, q5
- vsub.i16 q6, q14, q6
- vsub.i16 q7, q14, q7
- vst1.16 {d8, d9, d10, d11}, [r1, :256]
- vst1.16 {d12, d13, d14, d15}, [r9, :256]
- vshl.s16 q4, q15, q4
- vshl.s16 q5, q15, q5
- vshl.s16 q6, q15, q6
- vshl.s16 q7, q15, q7
- vsub.i16 q4, q4, q15
- vsub.i16 q5, q5, q15
- vsub.i16 q6, q6, q15
- vsub.i16 q7, q7, q15
- vand q8, q8, q4
- vand q9, q9, q5
- vand q10, q10, q6
- vand q11, q11, q7
- vst1.16 {d16, d17, d18, d19}, [r8, :256]
- vst1.16 {d20, d21, d22, d23}, [r3, :256]
- ldr r12, [r7, #0xc] /* r12 = actbl */
- add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
- mov r9, r12 /* r9 = actbl */
- add r6, r4, #0x80 /* r6 = t2 */
- ldr r11, [r0, #0x8] /* r11 = put_buffer */
- ldr r4, [r0, #0xc] /* r4 = put_bits */
- ldrh r2, [r6, #-128] /* r2 = nbits */
- ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */
- ldr r0, [lr, r2, lsl #2]
- ldrb r5, [r1, r2]
- put_bits r11, r4, r0, r5
- checkbuf15 r10, r11, r4, r5, r0
- put_bits r11, r4, r3, r2
- checkbuf15 r10, r11, r4, r5, r0
- mov lr, r6 /* lr = t2 */
- add r5, r9, #0x400 /* r5 = actbl->ehufsi */
- ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
- veor q8, q8, q8
- vceq.i16 q0, q0, q8
- vceq.i16 q1, q1, q8
- vceq.i16 q2, q2, q8
- vceq.i16 q3, q3, q8
- vceq.i16 q4, q4, q8
- vceq.i16 q5, q5, q8
- vceq.i16 q6, q6, q8
- vceq.i16 q7, q7, q8
- vmovn.i16 d0, q0
- vmovn.i16 d2, q1
- vmovn.i16 d4, q2
- vmovn.i16 d6, q3
- vmovn.i16 d8, q4
- vmovn.i16 d10, q5
- vmovn.i16 d12, q6
- vmovn.i16 d14, q7
- vand d0, d0, d26
- vand d2, d2, d26
- vand d4, d4, d26
- vand d6, d6, d26
- vand d8, d8, d26
- vand d10, d10, d26
- vand d12, d12, d26
- vand d14, d14, d26
- vpadd.i8 d0, d0, d2
- vpadd.i8 d4, d4, d6
- vpadd.i8 d8, d8, d10
- vpadd.i8 d12, d12, d14
- vpadd.i8 d0, d0, d4
- vpadd.i8 d8, d8, d12
- vpadd.i8 d0, d0, d8
- vmov.32 r1, d0[1]
- vmov.32 r8, d0[0]
- mvn r1, r1
- mvn r8, r8
- lsrs r1, r1, #0x1
- rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
- rbit r1, r1 /* r1 = index1 */
- rbit r8, r8 /* r8 = index0 */
- ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
- str r1, [sp, #0x14] /* index1 > sp + 0x14 */
- cmp r8, #0x0
- beq 6f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r12, sp, #0x20 /* r12 = t1 */
- ldr r8, [sp, #0x14] /* r8 = index1 */
- adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
- cmp r8, #0x0
- beq 6f
- clz r2, r8
- sub r12, r12, lr
- lsl r8, r8, r2
- add r2, r2, r12, lsr #1
- add lr, lr, r2, lsl #1
- b 7f
-1:
- clz r2, r8
- add lr, lr, r2, lsl #1
- lsl r8, r8, r2
-7:
- ldrh r1, [lr, #-126]
-2:
- cmp r2, #0x10
- blt 3f
- sub r2, r2, #0x10
- put_bits r11, r4, r0, r6
- cmp r4, #0x10
- blt 2b
- eor r3, r3, r3
- emit_byte r10, r11, r4, r3, r12
- emit_byte r10, r11, r4, r3, r12
- b 2b
-3:
- add r2, r1, r2, lsl #4
- ldrh r3, [lr, #2]!
- ldr r12, [r9, r2, lsl #2]
- ldrb r2, [r5, r2]
- put_bits r11, r4, r12, r2
- checkbuf15 r10, r11, r4, r2, r12
- put_bits r11, r4, r3, r1
- checkbuf15 r10, r11, r4, r2, r12
- lsls r8, r8, #0x1
- bne 1b
-6:
- add r0, sp, #0x20
- add r0, #0xfe
- cmp lr, r0
- bhs 1f
- ldr r1, [r9]
- ldrb r0, [r5]
- put_bits r11, r4, r1, r0
- checkbuf15 r10, r11, r4, r0, r1
-1:
- ldr r12, [sp, #0x18]
- str r11, [r12, #0x8]
- str r4, [r12, #0xc]
- add r0, r10, #0x1
- add r4, sp, #0x140
- vld1.64 {d8, d9, d10, d11}, [r4, :128]!
- vld1.64 {d12, d13, d14, d15}, [r4, :128]
- sub r4, r7, #0x1c
- mov sp, r4
- pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
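
Both the AArch32 emit_byte macro deleted above and its AArch64 counterpart below implement the JPEG byte-stuffing rule: every 0xFF written to the entropy-coded segment must be followed by a 0x00 stuff byte so that it cannot be read as a marker. A scalar C sketch of the same step, assuming the pre-incremented buffer pointer used by the assembly (all names are illustrative):

#include <stdint.h>

uint8_t *emit_byte(uint8_t *buffer, uint64_t put_buffer,
                   unsigned int *put_bits)
{
  *put_bits -= 8;                               /* 8 bits leave the buffer */
  uint8_t byte = (uint8_t)(put_buffer >> *put_bits);
  *++buffer = byte;
  if (byte == 0xFF)
    *++buffer = 0x00;                 /* stuff byte, per the JPEG specification */
  return buffer;
}
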
diff --git a/simd/arm/arm64/jsimd_neon.S b/simd/arm/arm64/jsimd_neon.S
deleted file mode 100644
index 898cf2c..0000000
--- a/simd/arm/arm64/jsimd_neon.S
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * ARMv8 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
- * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
-#endif
-
-#if defined(__APPLE__)
-.section __DATA, __const
-#elif defined(_WIN32)
-.section .rdata
-#else
-.section .rodata, "a", %progbits
-#endif
-
-/* Constants for jsimd_huff_encode_one_block_neon() */
-
-.balign 16
-Ljsimd_huff_encode_one_block_neon_consts:
- .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
- .byte 0, 1, 2, 3, 16, 17, 32, 33, \
- 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
- .byte 34, 35, 48, 49, 255, 255, 50, 51, \
- 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
- .byte 8, 9, 22, 23, 36, 37, 50, 51, \
- 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
- .byte 54, 55, 40, 41, 26, 27, 12, 13, \
- 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
- .byte 6, 7, 20, 21, 34, 35, 48, 49, \
- 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
- .byte 42, 43, 28, 29, 14, 15, 30, 31, \
- 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
- .byte 255, 255, 255, 255, 56, 57, 42, 43, \
- 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
- .byte 26, 27, 40, 41, 42, 43, 28, 29, \
- 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
- .byte 255, 255, 255, 255, 0, 1, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
- .byte 255, 255, 255, 255, 255, 255, 255, 255, \
- 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
- .byte 4, 5, 6, 7, 255, 255, 255, 255, \
- 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
-
-.text
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
- .private_extern _\fname
- .globl _\fname
-_\fname:
-#else
- .global \fname
-#ifdef __ELF__
- .hidden \fname
- .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Get symbol location */
-.macro get_symbol_loc reg, symbol
-#ifdef __APPLE__
- adrp \reg, \symbol@PAGE
- add \reg, \reg, \symbol@PAGEOFF
-#else
- adrp \reg, \symbol
- add \reg, \reg, :lo12:\symbol
-#endif
-.endm
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET *)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- * JCOEFPTR block, int last_dc_val,
- * c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
- BUFFER .req x1
- PUT_BUFFER .req x6
- PUT_BITS .req x7
- PUT_BITSw .req w7
-
-.macro emit_byte
- sub PUT_BITS, PUT_BITS, #0x8
- lsr x19, PUT_BUFFER, PUT_BITS
- uxtb w19, w19
- strb w19, [BUFFER, #1]!
- cmp w19, #0xff
- b.ne 14f
- strb wzr, [BUFFER, #1]!
-14:
-.endm
-.macro put_bits CODE, SIZE
- lsl PUT_BUFFER, PUT_BUFFER, \SIZE
- add PUT_BITS, PUT_BITS, \SIZE
- orr PUT_BUFFER, PUT_BUFFER, \CODE
-.endm
-.macro checkbuf31
- cmp PUT_BITS, #0x20
- b.lt 31f
- emit_byte
- emit_byte
- emit_byte
- emit_byte
-31:
-.endm
-.macro checkbuf47
- cmp PUT_BITS, #0x30
- b.lt 47f
- emit_byte
- emit_byte
- emit_byte
- emit_byte
- emit_byte
- emit_byte
-47:
-.endm
-
-.macro generate_jsimd_huff_encode_one_block fast_tbl
-
-.balign 16
-.if \fast_tbl == 1
-asm_function jsimd_huff_encode_one_block_neon
-.else
-asm_function jsimd_huff_encode_one_block_neon_slowtbl
-.endif
- sub sp, sp, 272
- sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
- /* Save ARM registers */
- stp x19, x20, [sp]
- get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
- ldr PUT_BUFFER, [x0, #0x10]
- ldr PUT_BITSw, [x0, #0x18]
- ldrsh w12, [x2] /* load DC coeff in w12 */
- /* prepare data */
-.if \fast_tbl == 1
- ld1 {v23.16b}, [x15], #16
- ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
- ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
- ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
- ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
- sub w12, w12, w3 /* last_dc_val, not used afterwards */
- /* ZigZag 8x8 */
- tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
- tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
- tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
- tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
- tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
- tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
- tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
- tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
- ins v0.h[0], w12
- tbx v1.16b, {v28.16b}, v16.16b
- tbx v2.16b, {v29.16b, v30.16b}, v17.16b
- tbx v5.16b, {v29.16b, v30.16b}, v18.16b
- tbx v6.16b, {v31.16b}, v19.16b
-.else
- add x13, x2, #0x22
- sub w12, w12, w3 /* last_dc_val, not used afterwards */
- ld1 {v23.16b}, [x15]
- add x14, x2, #0x18
- add x3, x2, #0x36
- ins v0.h[0], w12
- add x9, x2, #0x2
- ld1 {v1.h}[0], [x13]
- add x15, x2, #0x30
- ld1 {v2.h}[0], [x14]
- add x19, x2, #0x26
- ld1 {v3.h}[0], [x3]
- add x20, x2, #0x28
- ld1 {v0.h}[1], [x9]
- add x12, x2, #0x10
- ld1 {v1.h}[1], [x15]
- add x13, x2, #0x40
- ld1 {v2.h}[1], [x19]
- add x14, x2, #0x34
- ld1 {v3.h}[1], [x20]
- add x3, x2, #0x1a
- ld1 {v0.h}[2], [x12]
- add x9, x2, #0x20
- ld1 {v1.h}[2], [x13]
- add x15, x2, #0x32
- ld1 {v2.h}[2], [x14]
- add x19, x2, #0x42
- ld1 {v3.h}[2], [x3]
- add x20, x2, #0xc
- ld1 {v0.h}[3], [x9]
- add x12, x2, #0x12
- ld1 {v1.h}[3], [x15]
- add x13, x2, #0x24
- ld1 {v2.h}[3], [x19]
- add x14, x2, #0x50
- ld1 {v3.h}[3], [x20]
- add x3, x2, #0xe
- ld1 {v0.h}[4], [x12]
- add x9, x2, #0x4
- ld1 {v1.h}[4], [x13]
- add x15, x2, #0x16
- ld1 {v2.h}[4], [x14]
- add x19, x2, #0x60
- ld1 {v3.h}[4], [x3]
- add x20, x2, #0x1c
- ld1 {v0.h}[5], [x9]
- add x12, x2, #0x6
- ld1 {v1.h}[5], [x15]
- add x13, x2, #0x8
- ld1 {v2.h}[5], [x19]
- add x14, x2, #0x52
- ld1 {v3.h}[5], [x20]
- add x3, x2, #0x2a
- ld1 {v0.h}[6], [x12]
- add x9, x2, #0x14
- ld1 {v1.h}[6], [x13]
- add x15, x2, #0xa
- ld1 {v2.h}[6], [x14]
- add x19, x2, #0x44
- ld1 {v3.h}[6], [x3]
- add x20, x2, #0x38
- ld1 {v0.h}[7], [x9]
- add x12, x2, #0x46
- ld1 {v1.h}[7], [x15]
- add x13, x2, #0x3a
- ld1 {v2.h}[7], [x19]
- add x14, x2, #0x74
- ld1 {v3.h}[7], [x20]
- add x3, x2, #0x6a
- ld1 {v4.h}[0], [x12]
- add x9, x2, #0x54
- ld1 {v5.h}[0], [x13]
- add x15, x2, #0x2c
- ld1 {v6.h}[0], [x14]
- add x19, x2, #0x76
- ld1 {v7.h}[0], [x3]
- add x20, x2, #0x78
- ld1 {v4.h}[1], [x9]
- add x12, x2, #0x62
- ld1 {v5.h}[1], [x15]
- add x13, x2, #0x1e
- ld1 {v6.h}[1], [x19]
- add x14, x2, #0x68
- ld1 {v7.h}[1], [x20]
- add x3, x2, #0x7a
- ld1 {v4.h}[2], [x12]
- add x9, x2, #0x70
- ld1 {v5.h}[2], [x13]
- add x15, x2, #0x2e
- ld1 {v6.h}[2], [x14]
- add x19, x2, #0x5a
- ld1 {v7.h}[2], [x3]
- add x20, x2, #0x6c
- ld1 {v4.h}[3], [x9]
- add x12, x2, #0x72
- ld1 {v5.h}[3], [x15]
- add x13, x2, #0x3c
- ld1 {v6.h}[3], [x19]
- add x14, x2, #0x4c
- ld1 {v7.h}[3], [x20]
- add x3, x2, #0x5e
- ld1 {v4.h}[4], [x12]
- add x9, x2, #0x64
- ld1 {v5.h}[4], [x13]
- add x15, x2, #0x4a
- ld1 {v6.h}[4], [x14]
- add x19, x2, #0x3e
- ld1 {v7.h}[4], [x3]
- add x20, x2, #0x6e
- ld1 {v4.h}[5], [x9]
- add x12, x2, #0x56
- ld1 {v5.h}[5], [x15]
- add x13, x2, #0x58
- ld1 {v6.h}[5], [x19]
- add x14, x2, #0x4e
- ld1 {v7.h}[5], [x20]
- add x3, x2, #0x7c
- ld1 {v4.h}[6], [x12]
- add x9, x2, #0x48
- ld1 {v5.h}[6], [x13]
- add x15, x2, #0x66
- ld1 {v6.h}[6], [x14]
- add x19, x2, #0x5c
- ld1 {v7.h}[6], [x3]
- add x20, x2, #0x7e
- ld1 {v4.h}[7], [x9]
- ld1 {v5.h}[7], [x15]
- ld1 {v6.h}[7], [x19]
- ld1 {v7.h}[7], [x20]
-.endif
- cmlt v24.8h, v0.8h, #0
- cmlt v25.8h, v1.8h, #0
- cmlt v26.8h, v2.8h, #0
- cmlt v27.8h, v3.8h, #0
- cmlt v28.8h, v4.8h, #0
- cmlt v29.8h, v5.8h, #0
- cmlt v30.8h, v6.8h, #0
- cmlt v31.8h, v7.8h, #0
- abs v0.8h, v0.8h
- abs v1.8h, v1.8h
- abs v2.8h, v2.8h
- abs v3.8h, v3.8h
- abs v4.8h, v4.8h
- abs v5.8h, v5.8h
- abs v6.8h, v6.8h
- abs v7.8h, v7.8h
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- eor v28.16b, v28.16b, v4.16b
- eor v29.16b, v29.16b, v5.16b
- eor v30.16b, v30.16b, v6.16b
- eor v31.16b, v31.16b, v7.16b
- cmeq v16.8h, v0.8h, #0
- cmeq v17.8h, v1.8h, #0
- cmeq v18.8h, v2.8h, #0
- cmeq v19.8h, v3.8h, #0
- cmeq v20.8h, v4.8h, #0
- cmeq v21.8h, v5.8h, #0
- cmeq v22.8h, v6.8h, #0
- xtn v16.8b, v16.8h
- xtn v18.8b, v18.8h
- xtn v20.8b, v20.8h
- xtn v22.8b, v22.8h
- umov w14, v0.h[0]
- xtn2 v16.16b, v17.8h
- umov w13, v24.h[0]
- xtn2 v18.16b, v19.8h
- clz w14, w14
- xtn2 v20.16b, v21.8h
- lsl w13, w13, w14
- cmeq v17.8h, v7.8h, #0
- sub w12, w14, #32
- xtn2 v22.16b, v17.8h
- lsr w13, w13, w14
- and v16.16b, v16.16b, v23.16b
- neg w12, w12
- and v18.16b, v18.16b, v23.16b
- add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
- and v20.16b, v20.16b, v23.16b
- add x15, sp, #0x90 /* x15 = t2 */
- and v22.16b, v22.16b, v23.16b
- ldr w10, [x4, x12, lsl #2]
- addp v16.16b, v16.16b, v18.16b
- ldrb w11, [x3, x12]
- addp v20.16b, v20.16b, v22.16b
- checkbuf47
- addp v16.16b, v16.16b, v20.16b
- put_bits x10, x11
- addp v16.16b, v16.16b, v18.16b
- checkbuf47
- umov x9, v16.D[0]
- put_bits x13, x12
- cnt v17.8b, v16.8b
- mvn x9, x9
- addv B18, v17.8b
- add x4, x5, #0x400 /* x4 = actbl->ehufsi */
- umov w12, v18.b[0]
- lsr x9, x9, #0x1 /* clear AC coeff */
- ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
- rbit x9, x9 /* x9 = index0 */
- ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
- cmp w12, #(64-8)
- add x11, sp, #16
- b.lt 4f
- cbz x9, 6f
- st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
- st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
- st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
-1:
- clz x2, x9
- add x15, x15, x2, lsl #1
- lsl x9, x9, x2
- ldrh w20, [x15, #-126]
-2:
- cmp x2, #0x10
- b.lt 3f
- sub x2, x2, #0x10
- checkbuf47
- put_bits x13, x14
- b 2b
-3:
- clz w20, w20
- ldrh w3, [x15, #2]!
- sub w11, w20, #32
- lsl w3, w3, w20
- neg w11, w11
- lsr w3, w3, w20
- add x2, x11, x2, lsl #4
- lsl x9, x9, #0x1
- ldr w12, [x5, x2, lsl #2]
- ldrb w10, [x4, x2]
- checkbuf31
- put_bits x12, x10
- put_bits x3, x11
- cbnz x9, 1b
- b 6f
-4:
- movi v21.8h, #0x0010
- clz v0.8h, v0.8h
- clz v1.8h, v1.8h
- clz v2.8h, v2.8h
- clz v3.8h, v3.8h
- clz v4.8h, v4.8h
- clz v5.8h, v5.8h
- clz v6.8h, v6.8h
- clz v7.8h, v7.8h
- ushl v24.8h, v24.8h, v0.8h
- ushl v25.8h, v25.8h, v1.8h
- ushl v26.8h, v26.8h, v2.8h
- ushl v27.8h, v27.8h, v3.8h
- ushl v28.8h, v28.8h, v4.8h
- ushl v29.8h, v29.8h, v5.8h
- ushl v30.8h, v30.8h, v6.8h
- ushl v31.8h, v31.8h, v7.8h
- neg v0.8h, v0.8h
- neg v1.8h, v1.8h
- neg v2.8h, v2.8h
- neg v3.8h, v3.8h
- neg v4.8h, v4.8h
- neg v5.8h, v5.8h
- neg v6.8h, v6.8h
- neg v7.8h, v7.8h
- ushl v24.8h, v24.8h, v0.8h
- ushl v25.8h, v25.8h, v1.8h
- ushl v26.8h, v26.8h, v2.8h
- ushl v27.8h, v27.8h, v3.8h
- ushl v28.8h, v28.8h, v4.8h
- ushl v29.8h, v29.8h, v5.8h
- ushl v30.8h, v30.8h, v6.8h
- ushl v31.8h, v31.8h, v7.8h
- add v0.8h, v21.8h, v0.8h
- add v1.8h, v21.8h, v1.8h
- add v2.8h, v21.8h, v2.8h
- add v3.8h, v21.8h, v3.8h
- add v4.8h, v21.8h, v4.8h
- add v5.8h, v21.8h, v5.8h
- add v6.8h, v21.8h, v6.8h
- add v7.8h, v21.8h, v7.8h
- st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
- st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
- st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
- st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
-1:
- clz x2, x9
- add x15, x15, x2, lsl #1
- lsl x9, x9, x2
- ldrh w11, [x15, #-126]
-2:
- cmp x2, #0x10
- b.lt 3f
- sub x2, x2, #0x10
- checkbuf47
- put_bits x13, x14
- b 2b
-3:
- ldrh w3, [x15, #2]!
- add x2, x11, x2, lsl #4
- lsl x9, x9, #0x1
- ldr w12, [x5, x2, lsl #2]
- ldrb w10, [x4, x2]
- checkbuf31
- put_bits x12, x10
- put_bits x3, x11
- cbnz x9, 1b
-6:
- add x13, sp, #0x10e
- cmp x15, x13
- b.hs 1f
- ldr w12, [x5]
- ldrb w14, [x4]
- checkbuf47
- put_bits x12, x14
-1:
- str PUT_BUFFER, [x0, #0x10]
- str PUT_BITSw, [x0, #0x18]
- ldp x19, x20, [sp], 16
- add x0, BUFFER, #0x1
- add sp, sp, 256
- br x30
-
-.endm
-
-generate_jsimd_huff_encode_one_block 1
-generate_jsimd_huff_encode_one_block 0
-
- .unreq BUFFER
- .unreq PUT_BUFFER
- .unreq PUT_BITS
- .unreq PUT_BITSw
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf31
-.purgem checkbuf47
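
The checkbuf31/checkbuf47 macros deleted above flush the 64-bit bit buffer only once enough whole bytes are pending (four bytes at 32 or more bits, six at 48 or more), which amortizes the per-byte stuffing check. A scalar sketch of that pattern, reusing the emit_byte() sketched after the AArch32 listing (all names are illustrative):

#include <stdint.h>

/* As sketched after the AArch32 listing: writes one byte (plus a stuff byte
 * after 0xFF) and returns the advanced buffer pointer. */
uint8_t *emit_byte(uint8_t *buffer, uint64_t put_buffer,
                   unsigned int *put_bits);

static uint8_t *flush_if_needed(uint8_t *buffer, uint64_t put_buffer,
                                unsigned int *put_bits,
                                unsigned int threshold, unsigned int nbytes)
{
  if (*put_bits >= threshold) {
    unsigned int n;
    for (n = 0; n < nbytes; n++)
      buffer = emit_byte(buffer, put_buffer, put_bits);
  }
  return buffer;
}

/* checkbuf31 ~ flush_if_needed(buffer, put_buffer, &put_bits, 32, 4);
 * checkbuf47 ~ flush_if_needed(buffer, put_buffer, &put_bits, 48, 6); */
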
diff --git a/simd/arm/common/jdsample-neon.c b/simd/arm/common/jdsample-neon.c
deleted file mode 100644
index e4f5129..0000000
--- a/simd/arm/common/jdsample-neon.c
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * jdsample-neon.c - upsampling (Arm NEON)
- *
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
-#include "../../jsimd.h"
-
-#include <arm_neon.h>
-
-/*
- * The diagram below shows a row of samples (luma or chroma) produced by h2v1
- * downsampling.
- *
- * s0 s1 s2
- * +---------+---------+---------+
- * | | | |
- * | p0 p1 | p2 p3 | p4 p5 |
- * | | | |
- * +---------+---------+---------+
- *
- * Each sample contains two of the original pixel channel values. These pixel
- * channel values are centred at positions p0, p1, p2, p3, p4 and p5 above. To
- * compute the channel values of the original image, we proportionally blend
- * the adjacent samples in each row.
- *
- * There are three cases to consider:
- *
- * 1) The first pixel in the original image.
- * Pixel channel value p0 contains only a component from sample s0, so we
- * set p0 = s0.
- * 2) The last pixel in the original image.
- * Pixel channel value p5 contains only a component from sample s2, so we
- * set p5 = s2.
- * 3) General case (all other pixels in the row).
- * Apart from the first and last pixels, every other pixel channel value is
- * computed by blending the containing sample and the nearest neighbouring
- * sample in the ratio 3:1.
- * For example, the pixel channel value centred at p1 would be computed as
- * follows:
- * 3/4 * s0 + 1/4 * s1
- * while the pixel channel value centred at p2 would be:
- * 3/4 * s1 + 1/4 * s0
- */
-
-void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr;
- /* Setup constants. */
- const uint16x8_t one_u16 = vdupq_n_u16(1);
- const uint8x8_t three_u8 = vdup_n_u8(3);
-
- for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
- /* Case 1: first pixel channel value in this row of the original image. */
- *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
-
- /* General case: */
- /* 3/4 * containing sample + 1/4 * nearest neighbouring sample */
- /* For p1: containing sample = s0, nearest neighbouring sample = s1. */
- /* For p2: containing sample = s1, nearest neighbouring sample = s0. */
- uint8x16_t s0 = vld1q_u8(inptr);
- uint8x16_t s1 = vld1q_u8(inptr + 1);
- /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
- /* denote low half and high half respectively. */
- uint16x8_t s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
- vget_low_u8(s0), three_u8);
- uint16x8_t s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
- vget_high_u8(s0), three_u8);
- uint16x8_t s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
- vget_low_u8(s1), three_u8);
- uint16x8_t s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
- vget_high_u8(s1), three_u8);
- /* Add ordered dithering bias to odd pixel values. */
- s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
- s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
-
- /* Initially 1 - due to having already stored the first pixel of the */
- /* image. However, in subsequent iterations of the SIMD loop this offset */
- /* is (2 * colctr - 1) to stay within the bounds of the sample buffers */
- /* without having to resort to a slow scalar tail case for the last */
- /* (downsampled_width % 16) samples. See "Creation of 2-D sample arrays" */
- /* in jmemmgr.c for details. */
- unsigned outptr_offset = 1;
- uint8x16x2_t output_pixels;
-
-#if defined(__aarch64__) && defined(__clang__) && !defined(__OPTIMIZE_SIZE__)
- /* Unrolling by four is beneficial on AArch64 as there are 16 additional */
- /* 128-bit SIMD registers to accommodate the extra data in flight. */
- #pragma clang loop unroll_count(4)
-#endif
- /* We use software pipelining to maximise performance. The code indented */
- /* an extra 6 spaces begins the next iteration of the loop. */
- for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
- s0 = vld1q_u8(inptr + colctr - 1);
- s1 = vld1q_u8(inptr + colctr);
- /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
- output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
- vrshrn_n_u16(s1_add_3s0_h, 2));
- output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
- vshrn_n_u16(s0_add_3s1_h, 2));
- /* Multiplication makes vectors twice as wide: '_l' and '_h' */
- /* suffixes denote low half and high half respectively. */
- s1_add_3s0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1)),
- vget_low_u8(s0), three_u8);
- s1_add_3s0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1)),
- vget_high_u8(s0), three_u8);
- s0_add_3s1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0)),
- vget_low_u8(s1), three_u8);
- s0_add_3s1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0)),
- vget_high_u8(s1), three_u8);
- /* Add ordered dithering bias to odd pixel values. */
- s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
- s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
- /* Store pixel channel values to memory. */
- vst2q_u8(outptr + outptr_offset, output_pixels);
- outptr_offset = 2 * colctr - 1;
- }
-
- /* Complete the last iteration of the loop. */
- /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
- output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
- vrshrn_n_u16(s1_add_3s0_h, 2));
- output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
- vshrn_n_u16(s0_add_3s1_h, 2));
- /* Store pixel channel values to memory. */
- vst2q_u8(outptr + outptr_offset, output_pixels);
-
- /* Case 2: last pixel channel value in this row of the original image. */
- outptr[2 * downsampled_width - 1] =
- GETJSAMPLE(inptr[downsampled_width - 1]);
- }
-}
-
-
-/*
- * The diagram below shows a grid-window of samples (luma or chroma) produced
- * by h2v2 downsampling.
- *
- * s0 s1
- * +---------+---------+
- * | p0 p1 | p2 p3 |
- * r0 | | |
- * | p4 p5 | p6 p7 |
- * +---------+---------+
- * | p8 p9 | p10 p11|
- * r1 | | |
- * | p12 p13| p14 p15|
- * +---------+---------+
- * | p16 p17| p18 p19|
- * r2 | | |
- * | p20 p21| p22 p23|
- * +---------+---------+
- *
- * Every sample contains four of the original pixel channel values. The pixels'
- * channel values are centred at positions p0, p1, p2,..., p23 above. For a
- * given grid-window position, r1 is always used to denote the row of samples
- * containing the pixel channel values we are computing. For the top row of
- * pixel channel values in r1 (p8-p11), the nearest neighbouring samples are in
- * the row above - denoted by r0. Likewise, for the bottom row of pixels in r1
- * (p12-p15), the nearest neighbouring samples are in the row below - denoted
- * by r2.
- *
- * To compute the pixel channel values of the original image, we proportionally
- * blend the sample containing the pixel centre with the nearest neighbouring
- * samples in each row, column and diagonal.
- *
- * There are three cases to consider:
- *
- * 1) The first pixel in this row of the original image.
- * Pixel channel value p8 only contains components from sample column s0.
- * Its value is computed by blending samples s0r1 and s0r0 in the ratio 3:1.
- * 2) The last pixel in this row of the original image.
- * Pixel channel value p11 only contains components from sample column s1.
- * Its value is computed by blending samples s1r1 and s1r0 in the ratio 3:1.
- * 3) General case (all other pixels in the row).
- * Apart from the first and last pixels, every other pixel channel value in
- * the row contains components from samples in adjacent columns.
- *
- * For example, the pixel centred at p9 would be computed as follows:
- * (9/16 * s0r1) + (3/16 * s0r0) + (3/16 * s1r1) + (1/16 * s1r0)
- *
- * This can be broken down into two steps:
- * 1) Blend samples vertically in columns s0 and s1 in the ratio 3:1:
- * s0colsum = 3/4 * s0r1 + 1/4 * s0r0
- * s1colsum = 3/4 * s1r1 + 1/4 * s1r0
- * 2) Blend the already-blended columns in the ratio 3:1:
- * p9 = 3/4 * s0colsum + 1/4 * s1colsum
- *
- * The bottom row of pixel channel values in row r1 can be computed in the same
- * way for each of the three cases, only using samples in row r2 instead of row
- * r0 - as r2 is the nearest neighbouring row.
- */
-
-void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
- int inrow, outrow;
- /* Setup constants. */
- const uint16x8_t seven_u16 = vdupq_n_u16(7);
- const uint8x8_t three_u8 = vdup_n_u8(3);
- const uint16x8_t three_u16 = vdupq_n_u16(3);
-
- inrow = outrow = 0;
- while (outrow < max_v_samp_factor) {
- inptr0 = input_data[inrow - 1];
- inptr1 = input_data[inrow];
- inptr2 = input_data[inrow + 1];
- /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
- /* respectively. */
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
-
- /* Case 1: first pixel channel value in this row of original image. */
- int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
- *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
- int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
- *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
-
- /* General case as described above. */
- /* Step 1: Blend samples vertically in columns s0 and s1. */
- /* Leave the divide by 4 to the end when it can be done for both */
- /* dimensions at once, right-shifting by 4. */
-
- /* Load and compute s0colsum0 and s0colsum1. */
- uint8x16_t s0r0 = vld1q_u8(inptr0);
- uint8x16_t s0r1 = vld1q_u8(inptr1);
- uint8x16_t s0r2 = vld1q_u8(inptr2);
- /* Multiplication makes vectors twice as wide: '_l' and '_h' suffixes */
- /* denote low half and high half respectively. */
- uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
- vget_low_u8(s0r1), three_u8);
- uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
- vget_high_u8(s0r1), three_u8);
- uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
- vget_low_u8(s0r1), three_u8);
- uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
- vget_high_u8(s0r1), three_u8);
- /* Load and compute s1colsum0 and s1colsum1. */
- uint8x16_t s1r0 = vld1q_u8(inptr0 + 1);
- uint8x16_t s1r1 = vld1q_u8(inptr1 + 1);
- uint8x16_t s1r2 = vld1q_u8(inptr2 + 1);
- uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
- vget_low_u8(s1r1), three_u8);
- uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
- vget_high_u8(s1r1), three_u8);
- uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
- vget_low_u8(s1r1), three_u8);
- uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
- vget_high_u8(s1r1), three_u8);
- /* Step 2: Blend the already-blended columns. */
- uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
- uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
- uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
- uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
- uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
- uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
- uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
- uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
- /* Add ordered dithering bias to odd pixel values. */
- output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
- output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
- output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
- output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
- /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
- uint8x16x2_t output_pixels0 = { vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
- vshrn_n_u16(output0_p1_h, 4)),
- vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
- vrshrn_n_u16(output0_p2_h, 4))
- };
- uint8x16x2_t output_pixels1 = { vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
- vshrn_n_u16(output1_p1_h, 4)),
- vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
- vrshrn_n_u16(output1_p2_h, 4))
- };
- /* Store pixel channel values to memory. */
- /* The minimum size of the output buffer for each row is 64 bytes => no */
- /* need to worry about buffer overflow here. See "Creation of 2-D sample */
- /* arrays" in jmemmgr.c for details. */
- vst2q_u8(outptr0 + 1, output_pixels0);
- vst2q_u8(outptr1 + 1, output_pixels1);
-
- /* The first pixel of the image shifted our loads and stores by one */
- /* byte. We have to re-align on a 32-byte boundary at some point before */
- /* the end of the row (we do it now on the 32/33 pixel boundary) to stay */
- /* within the bounds of the sample buffers without having to resort to a */
- /* slow scalar tail case for the last (downsampled_width % 16) samples. */
- /* See "Creation of 2-D sample arrays" in jmemmgr.c for details.*/
- for (unsigned colctr = 16; colctr < downsampled_width; colctr += 16) {
- /* Step 1: Blend samples vertically in columns s0 and s1. */
- /* Load and compute s0colsum0 and s0colsum1. */
- s0r0 = vld1q_u8(inptr0 + colctr - 1);
- s0r1 = vld1q_u8(inptr1 + colctr - 1);
- s0r2 = vld1q_u8(inptr2 + colctr - 1);
- s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r0)),
- vget_low_u8(s0r1), three_u8);
- s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r0)),
- vget_high_u8(s0r1), three_u8);
- s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0r2)),
- vget_low_u8(s0r1), three_u8);
- s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0r2)),
- vget_high_u8(s0r1), three_u8);
- /* Load and compute s1colsum0 and s1colsum1. */
- s1r0 = vld1q_u8(inptr0 + colctr);
- s1r1 = vld1q_u8(inptr1 + colctr);
- s1r2 = vld1q_u8(inptr2 + colctr);
- s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r0)),
- vget_low_u8(s1r1), three_u8);
- s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r0)),
- vget_high_u8(s1r1), three_u8);
- s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1r2)),
- vget_low_u8(s1r1), three_u8);
- s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1r2)),
- vget_high_u8(s1r1), three_u8);
- /* Step 2: Blend the already-blended columns. */
- output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
- output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
- output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
- output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
- output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
- output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
- output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
- output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
- /* Add ordered dithering bias to odd pixel values. */
- output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
- output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
- output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
- output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
- /* Right-shift by 4 (divide by 16), narrow to 8-bit and combine. */
- output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
- vshrn_n_u16(output0_p1_h, 4));
- output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
- vrshrn_n_u16(output0_p2_h, 4));
- output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
- vshrn_n_u16(output1_p1_h, 4));
- output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
- vrshrn_n_u16(output1_p2_h, 4));
- /* Store pixel channel values to memory. */
- vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
- vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
- }
-
- /* Case 2: last pixel channel value in this row of the original image. */
- int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
- GETJSAMPLE(inptr0[downsampled_width - 1]);
- outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
- int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
- GETJSAMPLE(inptr2[downsampled_width - 1]);
- outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
- inrow++;
- }
-}
-
-
-/*
- * The diagram below shows a grid-window of samples (luma or chroma) produced
- * by h2v1 downsampling, which has been subsequently rotated 90 degrees. (The
- * usual use of h1v2 upsampling is upsampling rotated or transposed h2v1
- * downsampled images.)
- *
- * s0 s1
- * +---------+---------+
- * | p0 | p1 |
- * r0 | | |
- * | p2 | p3 |
- * +---------+---------+
- * | p4 | p5 |
- * r1 | | |
- * | p6 | p7 |
- * +---------+---------+
- * | p8 | p9 |
- * r2 | | |
- * | p10 | p11 |
- * +---------+---------+
- *
- * Every sample contains two of the original pixel channel values. The pixels'
- * channel values are centred at positions p0, p1, p2,..., p11 above. For a
- * given grid-window position, r1 is always used to denote the row of samples
- * containing the pixel channel values we are computing. For the top row of
- * pixel channel values in r1 (p4 and p5), the nearest neighbouring samples are
- * in the row above - denoted by r0. Likewise, for the bottom row of pixels in
- * r1 (p6 and p7), the nearest neighbouring samples are in the row below -
- * denoted by r2.
- *
- * To compute the pixel channel values of the original image, we proportionally
- * blend the adjacent samples in each column.
- *
- * For example, the pixel channel value centred at p4 would be computed as
- * follows:
- * 3/4 * s0r1 + 1/4 * s0r0
- * while the pixel channel value centred at p6 would be:
- * 3/4 * s0r1 + 1/4 * s0r2
- */
-
-void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
- JDIMENSION downsampled_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
- int inrow, outrow;
- /* Setup constants. */
- const uint16x8_t one_u16 = vdupq_n_u16(1);
- const uint8x8_t three_u8 = vdup_n_u8(3);
-
- inrow = outrow = 0;
- while (outrow < max_v_samp_factor) {
- inptr0 = input_data[inrow - 1];
- inptr1 = input_data[inrow];
- inptr2 = input_data[inrow + 1];
- /* Suffixes 0 and 1 denote the top and bottom rows of output pixels */
- /* respectively. */
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
- inrow++;
-
- /* The size of the input and output buffers is always a multiple of 32 */
- /* bytes => no need to worry about buffer overflow when reading/writing */
- /* memory. See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
- for (unsigned colctr = 0; colctr < downsampled_width; colctr += 16) {
- /* Load samples. */
- uint8x16_t r0 = vld1q_u8(inptr0 + colctr);
- uint8x16_t r1 = vld1q_u8(inptr1 + colctr);
- uint8x16_t r2 = vld1q_u8(inptr2 + colctr);
- /* Blend samples vertically. */
- uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(r0)),
- vget_low_u8(r1), three_u8);
- uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(r0)),
- vget_high_u8(r1), three_u8);
- uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(r2)),
- vget_low_u8(r1), three_u8);
- uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(r2)),
- vget_high_u8(r1), three_u8);
- /* Add ordered dithering bias to pixel values in even output rows. */
- colsum0_l = vaddq_u16(colsum0_l, one_u16);
- colsum0_h = vaddq_u16(colsum0_h, one_u16);
- /* Right-shift by 2 (divide by 4), narrow to 8-bit and combine. */
- uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
- vshrn_n_u16(colsum0_h, 2));
- uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
- vrshrn_n_u16(colsum1_h, 2));
- /* Store pixel channel values to memory. */
- vst1q_u8(outptr0 + colctr, output_pixels0);
- vst1q_u8(outptr1 + colctr, output_pixels1);
- }
- }
-}
-
-
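
Per column, the removed jsimd_h1v2_fancy_upsample_neon() above amounts to the following scalar computation (a sketch; the function name and signature are illustrative only):

#include <stdint.h>

static void h1v2_fancy_upsample_row(const uint8_t *r0, const uint8_t *r1,
                                    const uint8_t *r2, uint8_t *out0,
                                    uint8_t *out1, unsigned width)
{
  for (unsigned col = 0; col < width; col++) {
    int colsum0 = 3 * r1[col] + r0[col];        /* blend toward row above */
    int colsum1 = 3 * r1[col] + r2[col];        /* blend toward row below */
    out0[col] = (uint8_t)((colsum0 + 1) >> 2);  /* dither bias 1, then >> 2 */
    out1[col] = (uint8_t)((colsum1 + 2) >> 2);  /* round to nearest */
  }
}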
-/*
- * The diagram below shows the operation of h2v1 (simple) upsampling. Each
- * sample in the row is duplicated to form two output pixel channel values.
- *
- * p0 p1 p2 p3
- * +----+----+ +----+----+----+----+
- * | s0 | s1 | -> | s0 | s0 | s1 | s1 |
- * +----+----+ +----+----+----+----+
- */
-
-void jsimd_h2v1_upsample_neon(int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr;
-
- for (int inrow = 0; inrow < max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr = output_data[inrow];
- for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
- uint8x16_t samples = vld1q_u8(inptr + colctr);
- /* Duplicate the samples - the store interleaves them to produce the */
- /* pattern in the diagram above. */
- uint8x16x2_t output_pixels = { samples, samples };
- /* Store pixel values to memory. */
- /* Due to the way sample buffers are allocated, we don't need to worry */
- /* about tail cases when output_width is not a multiple of 32. */
- /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
- vst2q_u8(outptr + 2 * colctr, output_pixels);
- }
- }
-}
-
-
-/*
- * The diagram below shows the operation of h2v2 (simple) upsampling. Each
- * sample in the row is duplicated to form two output pixel channel values.
- * This horizontally-upsampled row is then also duplicated.
- *
- * p0 p1 p2 p3
- * +-----+-----+ +-----+-----+-----+-----+
- * | s0 | s1 | -> | s0 | s0 | s1 | s1 |
- * +-----+-----+ +-----+-----+-----+-----+
- * | s0 | s0 | s1 | s1 |
- * +-----+-----+-----+-----+
- */
-
-void jsimd_h2v2_upsample_neon(int max_v_samp_factor,
- JDIMENSION output_width,
- JSAMPARRAY input_data,
- JSAMPARRAY *output_data_ptr)
-{
- JSAMPARRAY output_data = *output_data_ptr;
- JSAMPROW inptr, outptr0, outptr1;
-
- for (int inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
- inptr = input_data[inrow];
- outptr0 = output_data[outrow++];
- outptr1 = output_data[outrow++];
-
- for (unsigned colctr = 0; 2 * colctr < output_width; colctr += 16) {
- uint8x16_t samples = vld1q_u8(inptr + colctr);
- /* Duplicate the samples - the store interleaves them to produce the */
- /* pattern in the diagram above. */
- uint8x16x2_t output_pixels = { samples, samples };
- /* Store pixel values to memory for both output rows. */
- /* Due to the way sample buffers are allocated, we don't need to worry */
- /* about tail cases when output_width is not a multiple of 32. */
- /* See "Creation of 2-D sample arrays" in jmemmgr.c for details. */
- vst2q_u8(outptr0 + 2 * colctr, output_pixels);
- vst2q_u8(outptr1 + 2 * colctr, output_pixels);
- }
- }
-}
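
In scalar terms, the two simple upsamplers above are plain duplication: the interleaving vst2q_u8() store of two identical registers produces the s0 s0 s1 s1 pattern shown in the diagrams. A minimal sketch (names illustrative, not part of the patch):

#include <stdint.h>

static void h2v1_upsample_row(const uint8_t *in, uint8_t *out,
                              unsigned output_width)
{
  for (unsigned col = 0; 2 * col < output_width; col++) {
    out[2 * col]     = in[col];
    out[2 * col + 1] = in[col];
  }
}

/* h2v2 (simple) additionally writes the same upsampled row to both output
 * rows of the pair.
 */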
diff --git a/simd/arm/common/jccolor-neon.c b/simd/arm/jccolor-neon.c
index f87c8d9..9fcc62d 100644
--- a/simd/arm/common/jccolor-neon.c
+++ b/simd/arm/jccolor-neon.c
@@ -1,7 +1,8 @@
/*
* jccolor-neon.c - colorspace conversion (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,17 +22,19 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
#include <arm_neon.h>
-/* RGB -> YCbCr conversion constants. */
+
+/* RGB -> YCbCr conversion constants */
#define F_0_298 19595
#define F_0_587 38470
@@ -43,18 +46,17 @@
#define F_0_081 5329
ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
- F_0_298, F_0_587,
- F_0_113, F_0_168,
- F_0_331, F_0_500,
- F_0_418, F_0_081
- };
+ F_0_298, F_0_587, F_0_113, F_0_168,
+ F_0_331, F_0_500, F_0_418, F_0_081
+};
+
/* Include inline routines for colorspace extensions. */
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -66,10 +68,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_RGB_BLUE
#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgb_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -82,10 +84,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_RGBX_BLUE
#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extrgbx_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -98,10 +100,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_BGR_BLUE
#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgr_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -114,10 +116,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_BGRX_BLUE
#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extbgrx_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -130,10 +132,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_XBGR_BLUE
#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxbgr_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
@@ -146,10 +148,10 @@ ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
#define RGB_BLUE EXT_XRGB_BLUE
#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
#define jsimd_rgb_ycc_convert_neon jsimd_extxrgb_ycc_convert_neon
-#if defined(__aarch64__)
-#include "../arm64/jccolext-neon.c"
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
#else
-#include "../arm/jccolext-neon.c"
+#include "aarch32/jccolext-neon.c"
#endif
#undef RGB_RED
#undef RGB_GREEN
diff --git a/simd/arm/common/jcgray-neon.c b/simd/arm/jcgray-neon.c
index 39d903f..71c7b2d 100644
--- a/simd/arm/common/jcgray-neon.c
+++ b/simd/arm/jcgray-neon.c
@@ -1,7 +1,7 @@
/*
- * jcgray-neon.c - grayscale colorspace conversion (Arm NEON)
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,22 +21,24 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
-/* RGB -> Grayscale conversion constants. */
+
+/* RGB -> Grayscale conversion constants */
#define F_0_298 19595
#define F_0_587 38470
#define F_0_113 7471
+
/* Include inline routines for colorspace extensions. */
#include "jcgryext-neon.c"
diff --git a/simd/arm/common/jcgryext-neon.c b/simd/arm/jcgryext-neon.c
index 69ea67f..416a738 100644
--- a/simd/arm/common/jcgryext-neon.c
+++ b/simd/arm/jcgryext-neon.c
@@ -1,7 +1,7 @@
/*
- * jcgryext-neon.c - grayscale colorspace conversion (Arm NEON)
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -22,8 +22,8 @@
/* This file is included by jcgray-neon.c */
-/*
- * RGB -> Grayscale conversion is defined by the following equation:
+
+/* RGB -> Grayscale conversion is defined by the following equation:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
*
* Avoid floating point arithmetic by using shifted integer constants:
@@ -32,19 +32,17 @@
* 0.11399841 = 7471 * 2^-16
* These constants are defined in jcgray-neon.c
*
- * We use rounding later to get correct values.
- *
* This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
*/
-void jsimd_rgb_gray_convert_neon(JDIMENSION image_width,
- JSAMPARRAY input_buf,
- JSAMPIMAGE output_buf,
- JDIMENSION output_row,
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
JSAMPROW inptr;
JSAMPROW outptr;
+ /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+ ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
while (--num_rows >= 0) {
inptr = *input_buf++;
@@ -54,11 +52,11 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width,
int cols_remaining = image_width;
for (; cols_remaining > 0; cols_remaining -= 16) {
- /* To prevent buffer overread by the vector load instructions, the */
- /* last (image_width % 16) columns of data are first memcopied to a */
- /* temporary buffer large enough to accommodate the vector load. */
+ /* To prevent buffer overread by the vector load instructions, the last
+ * (image_width % 16) columns of data are first memcopied to a temporary
+ * buffer large enough to accommodate the vector load.
+ */
if (cols_remaining < 16) {
- ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
}
@@ -95,8 +93,9 @@ void jsimd_rgb_gray_convert_neon(JDIMENSION image_width,
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
- /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is */
- /* permitted up to the next multiple of ALIGN_SIZE bytes. */
+ /* Narrow Y values to 8-bit and store to memory. Buffer overwrite is
+ * permitted up to the next multiple of ALIGN_SIZE bytes.
+ */
vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
/* Increment pointers. */
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
new file mode 100644
index 0000000..d30759f
--- /dev/null
+++ b/simd/arm/jchuff.h
@@ -0,0 +1,149 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE 64
+#else
+#define BIT_BUF_SIZE 32
+#endif
+
+typedef struct {
+ size_t put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF. Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
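
The branchless pointer update in EMIT_BYTE() relies on the comparison evaluating to 0 or 1; the hypothetical helper below (not part of the header) spells out the two cases:

/* Advance is 1 for an ordinary byte (the speculative 0 gets overwritten) and
 * 2 for 0xFF (the stuffed 0x00 is kept.)
 */
static int emit_byte_advance(unsigned char b)
{
  return -(-2 + (b < 0xFF));  /* b < 0xFF: 1;  b == 0xFF: 2 */
}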
+
+/* Output the entire bit buffer. If there are no 0xFF bytes in it, then write
+ * directly to the output buffer. Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 56); \
+ buffer[1] = (JOCTET)(put_buffer >> 48); \
+ buffer[2] = (JOCTET)(put_buffer >> 40); \
+ buffer[3] = (JOCTET)(put_buffer >> 32); \
+ buffer[4] = (JOCTET)(put_buffer >> 24); \
+ buffer[5] = (JOCTET)(put_buffer >> 16); \
+ buffer[6] = (JOCTET)(put_buffer >> 8); \
+ buffer[7] = (JOCTET)(put_buffer ); \
+}
+#else
+#define SPLAT() { \
+ __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+ *((uint64_t *)buffer) = put_buffer; \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT() \
+ buffer += 8; \
+ } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+ buffer[0] = (JOCTET)(put_buffer >> 24); \
+ buffer[1] = (JOCTET)(put_buffer >> 16); \
+ buffer[2] = (JOCTET)(put_buffer >> 8); \
+ buffer[3] = (JOCTET)(put_buffer ); \
+}
+#else
+#define SPLAT() { \
+ __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+ *((uint32_t *)buffer) = put_buffer; \
+}
+#endif
+
+#define FLUSH() { \
+ if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ SPLAT() \
+ buffer += 4; \
+ } \
+}
+
+#endif
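
The test in FLUSH() is a SWAR check for 0xFF bytes. A scalar sketch of the 64-bit form follows; the 32-bit form is analogous with the 0x80808080 and 0x01010101 masks. The helper name is illustrative.

#include <stdint.h>

/* Nonzero when at least one byte of the accumulator is 0xFF and therefore
 * needs to be escaped as 0xFF 0x00 via the EMIT_BYTE() path.
 */
static int contains_0xff_byte(uint64_t put_buffer)
{
  return (put_buffer & 0x8080808080808080ULL &
          ~(put_buffer + 0x0101010101010101ULL)) != 0;
}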
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+ diff |= code << nbits; \
+ nbits += size; \
+ PUT_BITS(diff, nbits) \
+}
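
Taken together, PUT_BITS() and PUT_AND_FLUSH() maintain a bit accumulator that fills from the low-order end, with free_bits counting the unused capacity. A plain-C model of the same bookkeeping, assuming the 64-bit BIT_BUF_SIZE and with a hypothetical emit_word() standing in for FLUSH(), is sketched below.

#include <stdint.h>

typedef struct {
  uint64_t put_buffer;  /* codes accumulate in the low-order bits */
  int free_bits;        /* starts at 64 (BIT_BUF_SIZE) */
} bit_state;

static void put_bits_model(bit_state *s, unsigned code, int size,
                           void (*emit_word)(uint64_t))
{
  s->free_bits -= size;
  if (s->free_bits < 0) {
    /* Top the buffer up with the leading bits of code, emit it, then restart
     * the buffer with code (the already-emitted high bits are shifted out
     * before the next emit.)
     */
    s->put_buffer = (s->put_buffer << (size + s->free_bits)) |
                    ((uint64_t)code >> -s->free_bits);
    emit_word(s->put_buffer);
    s->free_bits += 64;
    s->put_buffer = code;
  } else {
    s->put_buffer = (s->put_buffer << size) | code;
  }
}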
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000..8b6d53b
--- /dev/null
+++ b/simd/arm/jcphuff-neon.c
@@ -0,0 +1,591 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *values_ptr = values;
+ JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+ int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+ int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs1);
+ vst1q_s16(values_ptr + DCTSIZE, coefs2);
+ vst1q_s16(diff_values_ptr, diff1);
+ vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ values_ptr += 16;
+ diff_values_ptr += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ default:
+ break;
+ }
+
+ /* Isolate sign of coefficients. */
+ int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+ /* Compute diff values. */
+ int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+ /* Store transformed coefficients and diff values. */
+ vst1q_s16(values_ptr, coefs);
+ vst1q_s16(diff_values_ptr, diff);
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in the values and diff_values blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(values_ptr, vdupq_n_s16(0));
+ vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ values_ptr += 8;
+ diff_values_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. A set bit means that the corresponding
+ * coefficient != 0.
+ */
+ int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+ int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+ int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+ int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+ int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+ int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+ int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+ int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+ row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+ row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+ row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+ row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+ row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+ row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+ row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ *zerobits = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ zerobits[0] = ~bitmap0;
+ zerobits[1] = ~bitmap1;
+#endif
+}
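
The vceqq/vand/vpadd sequence above builds a mask of zero coefficients, one bit per coefficient in little-endian lane order, and then inverts it. The net effect, in scalar form (a sketch; JCOEF is a 16-bit type, and the helper name is illustrative):

#include <stddef.h>
#include <stdint.h>

static void make_zerobits(const int16_t *values /* 64 entries */,
                          size_t *zerobits)
{
  uint64_t bitmap = 0;
  for (int i = 0; i < 64; i++)
    if (values[i] != 0)
      bitmap |= (uint64_t)1 << i;   /* bit i set <=> coefficient i != 0 */
#if defined(__aarch64__) || defined(_M_ARM64)
  zerobits[0] = (size_t)bitmap;
#else
  zerobits[0] = (size_t)(bitmap & 0xFFFFFFFFU);
  zerobits[1] = (size_t)(bitmap >> 32);
#endif
}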
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits)
+{
+ /* Temporary storage buffers for data used to compute the signbits bitmap and
+ * the end-of-block (EOB) position
+ */
+ uint8_t coef_sign_bits[64];
+ uint8_t coef_eq1_bits[64];
+
+ JCOEF *absvalues_ptr = absvalues;
+ uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+ uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+ /* Rows of coefficients to zero (since they haven't been processed) */
+ int i, rows_to_zero = 8;
+
+ for (i = 0; i < Sl / 16; i++) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+ }
+
+ /* Same operation but for remaining partial vector */
+ int remaining_coefs = Sl % 16;
+ if (remaining_coefs > 8) {
+ int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+ coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+ int16x8_t coefs2 = vdupq_n_s16(0);
+ switch (remaining_coefs) {
+ case 15:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+ case 14:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+ case 13:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+ case 12:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+ case 11:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+ case 10:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+ case 9:
+ coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs1 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+ uint8x8_t sign_coefs2 =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+ vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+ int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+ coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+ coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs1);
+ vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq11);
+ vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+ absvalues_ptr += 16;
+ coef_sign_bits_ptr += 16;
+ eq1_bits_ptr += 16;
+ jpeg_natural_order_start += 16;
+ rows_to_zero -= 2;
+
+ } else if (remaining_coefs > 0) {
+ int16x8_t coefs = vdupq_n_s16(0);
+
+ switch (remaining_coefs) {
+ case 8:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+ case 7:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+ case 6:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+ case 5:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+ case 4:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+ case 3:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+ case 2:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+ case 1:
+ coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+ default:
+ break;
+ }
+
+ /* Compute and store data for signbits bitmap. */
+ uint8x8_t sign_coefs =
+ vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+ vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+ /* Compute absolute value of coefficients and apply point transform Al. */
+ int16x8_t abs_coefs = vabsq_s16(coefs);
+ coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_s16(absvalues_ptr, coefs);
+
+ /* Test whether transformed coefficient values == 1 (used to find EOB
+ * position.)
+ */
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+ vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ rows_to_zero--;
+ }
+
+ /* Zero remaining memory in blocks. */
+ for (i = 0; i < rows_to_zero; i++) {
+ vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+ vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+ vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+ absvalues_ptr += 8;
+ coef_sign_bits_ptr += 8;
+ eq1_bits_ptr += 8;
+ }
+
+ /* Construct zerobits bitmap. */
+ int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+ int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+ int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+ int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+ int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+ int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+ int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+ int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+ /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+ const uint8x8_t bitmap_mask =
+ vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+ abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+ abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+ abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+ abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+ abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+ abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+ abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+ abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+ uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+ uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+ uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+ uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+ uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store zerobits bitmap. */
+ bits[0] = ~bitmap0;
+ bits[1] = ~bitmap1;
+#endif
+
+ /* Construct signbits bitmap. */
+ uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+ uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+ uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+ uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+ uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+ uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+ uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+ uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+ signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+ signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+ signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+ signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+ signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+ signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+ signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+ signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+ bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+ bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+ bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+ /* Store signbits bitmap. */
+ bits[1] = ~bitmap;
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+ /* Store signbits bitmap. */
+ bits[2] = ~bitmap0;
+ bits[3] = ~bitmap1;
+#endif
+
+ /* Construct bitmap to find EOB position (the index of the last coefficient
+ * equal to 1.)
+ */
+ uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+ uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+ uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+ uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+ uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+ uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+ uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+ uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+ row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+ row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+ row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+ row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+ row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+ row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+ row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+ row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+ bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+ bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+ bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+ bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+ bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+ bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+ bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+ /* Move bitmap to a 64-bit scalar register. */
+ bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+ /* Return EOB position. */
+ if (bitmap == 0) {
+ /* EOB position is defined to be 0 if all coefficients != 1. */
+ return 0;
+ } else {
+ return 63 - BUILTIN_CLZL(bitmap);
+ }
+#else
+ /* Move bitmap to two 32-bit scalar registers. */
+ bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+ bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+ /* Return EOB position. */
+ if (bitmap0 == 0 && bitmap1 == 0) {
+ return 0;
+ } else if (bitmap1 != 0) {
+ return 63 - BUILTIN_CLZ(bitmap1);
+ } else {
+ return 31 - BUILTIN_CLZ(bitmap0);
+ }
+#endif
+}
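
The final CLZ step above simply locates the highest-index coefficient whose point-transformed absolute value is exactly 1, returning 0 when there is none. In scalar form (sketch only):

#include <stdint.h>

static int find_eob_position(const int16_t *absvalues /* 64 entries */)
{
  int eob = 0;
  for (int i = 0; i < 64; i++)
    if (absvalues[i] == 1)
      eob = i;
  return eob;
}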
diff --git a/simd/arm/common/jcsample-neon.c b/simd/arm/jcsample-neon.c
index a5ddf16..8a3e237 100644
--- a/simd/arm/common/jcsample-neon.c
+++ b/simd/arm/jcsample-neon.c
@@ -1,7 +1,7 @@
/*
- * jcsample-neon.c - downsampling (Arm NEON)
+ * jcsample-neon.c - downsampling (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,13 +21,13 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
@@ -68,35 +68,34 @@ ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
};
-/*
- * Downsample pixel values of a single chroma component i.e. Cb, Cr.
+/* Downsample pixel values of a single component.
* This version handles the common case of 2:1 horizontal and 1:1 vertical,
* without smoothing.
*/
-void jsimd_h2v1_downsample_neon(JDIMENSION image_width,
- int max_v_samp_factor,
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
- JSAMPARRAY input_data,
- JSAMPARRAY output_data)
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
- const uint8x16_t expand_mask = vld1q_u8(
- &jsimd_h2_downsample_consts[mask_offset]);
- /* Load bias pattern alternating every pixel. */
- const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
-
- for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) {
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr = input_data[outrow];
/* Downsample all but the last DCT block of pixels. */
- for (unsigned i = 0; i < width_in_blocks - 1; i++) {
+ for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
- /* Add adjacent pixel values, widen to 16-bit and add bias. */
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
/* Divide total by 2 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
@@ -106,56 +105,56 @@ void jsimd_h2v1_downsample_neon(JDIMENSION image_width,
/* Load pixels in last DCT block into a table. */
uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels = vqtbl1q_u8(pixels, expand_mask);
#else
- uint8x8x2_t table = { vget_low_u8(pixels), vget_high_u8(pixels) };
+ uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif
- /* Add adjacent pixel values, widen to 16-bit and add bias. */
+ /* Add adjacent pixel values, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
- /* Divide total by 2, narrow to 8-bit and store. */
+ /* Divide total by 2, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
}
-/*
- * Downsample pixel values of a single chroma component i.e. Cb, Cr.
+/* Downsample pixel values of a single component.
* This version handles the standard case of 2:1 horizontal and 2:1 vertical,
* without smoothing.
*/
-void jsimd_h2v2_downsample_neon(JDIMENSION image_width,
- int max_v_samp_factor,
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
JDIMENSION v_samp_factor,
JDIMENSION width_in_blocks,
- JSAMPARRAY input_data,
- JSAMPARRAY output_data)
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
{
JSAMPROW inptr0, inptr1, outptr;
/* Load expansion mask to pad remaining elements of last DCT block. */
const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
- const uint8x16_t expand_mask = vld1q_u8(
- &jsimd_h2_downsample_consts[mask_offset]);
- /* Load bias pattern alternating every pixel. */
- const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
-
- for (unsigned outrow = 0; outrow < v_samp_factor; outrow++) {
+ const uint8x16_t expand_mask =
+ vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+ /* Load bias pattern (alternating every pixel.) */
+ /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+ const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+ unsigned i, outrow;
+
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
outptr = output_data[outrow];
inptr0 = input_data[outrow];
inptr1 = input_data[outrow + 1];
/* Downsample all but the last DCT block of pixels. */
- for (unsigned i = 0; i < width_in_blocks - 1; i++) {
+ for (i = 0; i < width_in_blocks - 1; i++) {
uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
- /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
- /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+ */
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
/* Divide total by 4 and narrow to 8-bit. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
@@ -164,27 +163,29 @@ void jsimd_h2v2_downsample_neon(JDIMENSION image_width,
}
/* Load pixels in last DCT block into a table. */
- uint8x16_t pixels_r0 = vld1q_u8(
- inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
- uint8x16_t pixels_r1 = vld1q_u8(
- inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
-#if defined(__aarch64__)
+ uint8x16_t pixels_r0 =
+ vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+ uint8x16_t pixels_r1 =
+ vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
/* Pad the empty elements with the value of the last pixel. */
pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
- uint8x8x2_t table_r0 = { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) };
- uint8x8x2_t table_r1 = { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) };
+ uint8x8x2_t table_r0 =
+ { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+ uint8x8x2_t table_r1 =
+ { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
- /* Add adjacent pixel values in row 0, widen to 16-bit and add bias. */
+ /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
- /* Add adjacent pixel values in row 1, widen to 16-bit and accumulate. */
+ /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
- /* Divide total by 4, narrow to 8-bit and store. */
+ /* Divide total by 4, narrow to 8-bit, and store. */
uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
}
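
For reference, the per-sample arithmetic behind jsimd_h2v1_downsample_neon() above reduces to the sketch below; the alternating 0/1 bias provides the ordered dither before the halving shift. The h2v2 variant sums the four neighbouring pixels with an alternating 1/2 bias and shifts by 2 instead. Names are illustrative.

#include <stdint.h>

static void h2v1_downsample_row(const uint8_t *in, uint8_t *out,
                                unsigned output_cols)
{
  for (unsigned i = 0; i < output_cols; i++) {
    unsigned bias = i & 1;                              /* 0, 1, 0, 1, ... */
    out[i] = (uint8_t)((in[2 * i] + in[2 * i + 1] + bias) >> 1);
  }
}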
diff --git a/simd/arm/common/jdcolext-neon.c b/simd/arm/jdcolext-neon.c
index b201792..ae440f4 100644
--- a/simd/arm/common/jdcolext-neon.c
+++ b/simd/arm/jdcolext-neon.c
@@ -1,7 +1,8 @@
/*
- * jdcolext-neon.c - colorspace conversion (Arm NEON)
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -22,8 +23,8 @@
/* This file is included by jdcolor-neon.c. */
-/*
- * YCbCr -> RGB conversion is defined by the following equations:
+
+/* YCbCr -> RGB conversion is defined by the following equations:
* R = Y + 1.40200 * (Cr - 128)
* G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
* B = Y + 1.77200 * (Cb - 128)
@@ -35,31 +36,29 @@
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdcolor-neon.c.
*
- * Rounding is used when descaling to ensure correct results.
+ * To ensure correct results, rounding is used when descaling.
*/
-/*
- * Notes on safe memory access for YCbCr -> RGB conversion routines:
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
*
* Input memory buffers can be safely overread up to the next multiple of
- * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
- * The output buffer cannot safely be written beyond output_width since the
- * TurboJPEG API permits it to be allocated with or without padding up to the
- * next multiple of ALIGN_SIZE bytes.
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
*/
-void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
- JSAMPIMAGE input_buf,
- JDIMENSION input_row,
- JSAMPARRAY output_buf,
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
int num_rows)
{
JSAMPROW outptr;
- /* Pointers to Y, Cb and Cr data. */
+ /* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
const int16x8_t neg_128 = vdupq_n_s16(-128);
while (--num_rows >= 0) {
@@ -74,47 +73,67 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
uint8x16_t cb = vld1q_u8(inptr1);
uint8x16_t cr = vld1q_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128_l = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cr)));
- int16x8_t cr_128_h = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cr)));
- int16x8_t cb_128_l = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_low_u8(cb)));
- int16x8_t cb_128_h = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), vget_high_u8(cb)));
+ int16x8_t cr_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cr)));
+ int16x8_t cr_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cr)));
+ int16x8_t cb_128_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_low_u8(cb)));
+ int16x8_t cb_128_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+ vget_high_u8(cb)));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_ll = vmull_n_s16(vget_low_s16(cb_128_l), -F_0_344);
- int32x4_t g_sub_y_lh = vmull_n_s16(vget_high_s16(cb_128_l), -F_0_344);
- int32x4_t g_sub_y_hl = vmull_n_s16(vget_low_s16(cb_128_h), -F_0_344);
- int32x4_t g_sub_y_hh = vmull_n_s16(vget_high_s16(cb_128_h), -F_0_344);
- g_sub_y_ll = vmlsl_n_s16(g_sub_y_ll, vget_low_s16(cr_128_l), F_0_714);
- g_sub_y_lh = vmlsl_n_s16(g_sub_y_lh, vget_high_s16(cr_128_l), F_0_714);
- g_sub_y_hl = vmlsl_n_s16(g_sub_y_hl, vget_low_s16(cr_128_h), F_0_714);
- g_sub_y_hh = vmlsl_n_s16(g_sub_y_hh, vget_high_s16(cr_128_h), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+ int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+ consts, 0);
+ int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+ int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+ consts, 0);
+ g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+ consts, 1);
+ g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+ consts, 1);
+ g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+ consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
vrshrn_n_s32(g_sub_y_lh, 15));
int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
vrshrn_n_s32(g_sub_y_hh, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_l, 1), F_1_402);
- int16x8_t r_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cr_128_h, 1), F_1_402);
+ int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+ consts, 2);
+ int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+ consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y_l = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_l, 1), F_1_772);
- int16x8_t b_sub_y_h = vqrdmulhq_n_s16(vshlq_n_s16(cb_128_h, 1), F_1_772);
+ int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+ consts, 3);
+ int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+ consts, 3);
/* Add Y. */
- int16x8_t r_l = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l), vget_low_u8(y)));
- int16x8_t r_h = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h), vget_high_u8(y)));
- int16x8_t b_l = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l), vget_low_u8(y)));
- int16x8_t b_h = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h), vget_high_u8(y)));
- int16x8_t g_l = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l), vget_low_u8(y)));
- int16x8_t g_h = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h), vget_high_u8(y)));
+ int16x8_t r_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t r_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t b_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t b_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+ vget_high_u8(y)));
+ int16x8_t g_l =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+ vget_low_u8(y)));
+ int16x8_t g_h =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+ vget_high_u8(y)));
#if RGB_PIXELSIZE == 4
uint8x16x4_t rgba;
@@ -134,8 +153,8 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
/* Store RGB pixel data to memory. */
vst3q_u8(outptr, rgb);
-#else /* RGB565 */
- /* Pack R, G and B values in ratio 5:6:5. */
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
@@ -145,7 +164,7 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565_l);
vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
-#endif /* RGB565 */
+#endif
/* Increment pointers. */
inptr0 += 16;
@@ -159,29 +178,31 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
/* Add Y. */
- int16x8_t r = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
- int16x8_t b = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
- int16x8_t g = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
@@ -201,14 +222,14 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
vst3_u8(outptr, rgb);
-#else /* RGB565 */
- /* Pack R, G and B values in ratio 5:6:5. */
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB pixel data to memory. */
vst1q_u16((uint16_t *)outptr, rgb565);
-#endif /* RGB565 */
+#endif
/* Increment pointers. */
inptr0 += 8;
@@ -224,29 +245,31 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+ consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+ consts, 3);
/* Add Y. */
- int16x8_t r = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
- int16x8_t b = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
- int16x8_t g = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+ int16x8_t r =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+ int16x8_t b =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+ int16x8_t g =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
#if RGB_PIXELSIZE == 4
uint8x8x4_t rgba;
@@ -258,19 +281,19 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
- case 7 :
+ case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
- case 6 :
+ case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
- case 5 :
+ case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
- case 4 :
+ case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
- case 3 :
+ case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
- case 2 :
+ case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
- case 1 :
+ case 1:
vst4_lane_u8(outptr, rgba, 0);
default:
break;
@@ -283,48 +306,48 @@ void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width,
rgb.val[RGB_BLUE] = vqmovun_s16(b);
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
- case 7 :
+ case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
- case 6 :
+ case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
- case 5 :
+ case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
- case 4 :
+ case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
- case 3 :
+ case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
- case 2 :
+ case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
- case 1 :
+ case 1:
vst3_lane_u8(outptr, rgb, 0);
default:
break;
}
-#else /* RGB565 */
- /* Pack R, G and B values in ratio 5:6:5. */
+#else
+ /* Pack R, G, and B values in ratio 5:6:5. */
uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
/* Store RGB565 pixel data to memory. */
switch (cols_remaining) {
- case 7 :
- vst1q_lane_u16(outptr + 6 * RGB_PIXELSIZE, rgb565, 6);
- case 6 :
- vst1q_lane_u16(outptr + 5 * RGB_PIXELSIZE, rgb565, 5);
- case 5 :
- vst1q_lane_u16(outptr + 4 * RGB_PIXELSIZE, rgb565, 4);
- case 4 :
- vst1q_lane_u16(outptr + 3 * RGB_PIXELSIZE, rgb565, 3);
- case 3 :
- vst1q_lane_u16(outptr + 2 * RGB_PIXELSIZE, rgb565, 2);
- case 2 :
- vst1q_lane_u16(outptr + RGB_PIXELSIZE, rgb565, 1);
- case 1 :
- vst1q_lane_u16(outptr, rgb565, 0);
+ case 7:
+ vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+ case 6:
+ vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+ case 5:
+ vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+ case 4:
+ vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+ case 3:
+ vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+ case 2:
+ vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+ case 1:
+ vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
default:
break;
}
-#endif /* RGB565 */
+#endif
}
}
}
diff --git a/simd/arm/common/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index 52dab1e..28dbc57 100644
--- a/simd/arm/common/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -1,7 +1,7 @@
/*
- * jdcolor-neon.c - colorspace conversion (Arm NEON)
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,22 +21,29 @@
*/
#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
-/* YCbCr -> RGB conversion constants. */
+
+/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
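The constants above use two different fixed-point scales: F_0_344 and F_0_714 are relative to 2^-15, while F_1_402 and F_1_772 exceed 1.0 and are therefore stored relative to 2^-14. The conversion code compensates for the coarser scale by doubling the centered chroma value (vshlq_n_s16(..., 1)) before the saturating rounding-doubling-multiply-high instruction (vqrdmulhq_lane_s16). The scalar sketch below models that instruction and shows the net effect for one value; the main() harness and the sample input are illustrative only and are not part of this patch:

#include <stdio.h>
#include <stdint.h>

/* Scalar model of the Neon "rounding doubling multiply, return high half"
 * operation on 16-bit lanes: (2*a*b + 2^15) >> 16, with saturation omitted
 * for brevity. */
static int16_t rdmulh_q15(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b * 2 + (1 << 15)) >> 16);
}

int main(void)
{
  int cr = 200 - 128;  /* example centered Cr value */
  /* 1.40200 * (Cr - 128), computed as rdmulh(2 * (Cr - 128), F_1_402),
   * where F_1_402 = 22971 = 1.4020386 * 2^14 */
  int r_sub_y = rdmulh_q15((int16_t)(cr * 2), 22971);
  printf("exact: %.3f  fixed-point: %d\n", 1.402 * cr, r_sub_y);
  return 0;
}

Doubling the centered chroma value (range [-128, 127]) cannot overflow a 16-bit lane, so the only precision cost is the final rounding.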
/* Include inline routines for colorspace extensions. */
#include "jdcolext-neon.c"
@@ -125,7 +132,7 @@
#undef RGB_PIXELSIZE
#undef jsimd_ycc_rgb_convert_neon
-/* YCbCr -> RGB565 Conversion. */
+/* YCbCr -> RGB565 Conversion */
#define RGB_PIXELSIZE 2
#define jsimd_ycc_rgb_convert_neon jsimd_ycc_rgb565_convert_neon
diff --git a/simd/arm/common/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
index 71798c7..18fb9d8 100644
--- a/simd/arm/common/jdmerge-neon.c
+++ b/simd/arm/jdmerge-neon.c
@@ -1,7 +1,7 @@
/*
- * jdmerge-neon.c - merged upsampling/color conversion (Arm NEON)
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,23 +21,30 @@
*/
#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
-/* YCbCr -> RGB conversion constants. */
+
+/* YCbCr -> RGB conversion constants */
#define F_0_344 11277 /* 0.3441467 = 11277 * 2^-15 */
#define F_0_714 23401 /* 0.7141418 = 23401 * 2^-15 */
#define F_1_402 22971 /* 1.4020386 = 22971 * 2^-14 */
#define F_1_772 29033 /* 1.7720337 = 29033 * 2^-14 */
-/* Include inline routines for colorspace extensions */
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+ -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
#include "jdmrgext-neon.c"
#undef RGB_RED
@@ -135,4 +142,3 @@
#undef RGB_ALPHA
#undef RGB_PIXELSIZE
#undef jsimd_h2v1_merged_upsample_neon
-#undef jsimd_h2v2_merged_upsample_neon
diff --git a/simd/arm/common/jdmrgext-neon.c b/simd/arm/jdmrgext-neon.c
index 8533d71..fa2ec05 100644
--- a/simd/arm/common/jdmrgext-neon.c
+++ b/simd/arm/jdmrgext-neon.c
@@ -1,7 +1,8 @@
/*
- * jdmrgext-neon.c - merged upsampling/color conversion (Arm NEON)
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -22,9 +23,9 @@
/* This file is included by jdmerge-neon.c. */
-/*
- * These routines perform simple chroma upsampling - h2v1 or h2v2 - followed by
- * YCbCr -> RGB color conversion all in the same function.
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
*
* As with the standalone functions, YCbCr -> RGB conversion is defined by the
* following equations:
@@ -39,24 +40,22 @@
* 1.7720337 = 29033 * 2^-14
* These constants are defined in jdmerge-neon.c.
*
- * Rounding is used when descaling to ensure correct results.
+ * To ensure correct results, rounding is used when descaling.
*/
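For reference, the per-pixel arithmetic that the vector code performs 8 or 16 pixels at a time can be written as the following scalar sketch, using the same fixed-point coefficients (11277, 23401, 22971, 29033) defined in jdmerge-neon.c. clamp() is a local helper, not a libjpeg-turbo function, and arithmetic right shift of negative values is assumed:

static unsigned char clamp(int x)
{
  return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void ycc_to_rgb_pixel(int y, int cb, int cr, unsigned char *r,
                             unsigned char *g, unsigned char *b)
{
  cb -= 128;
  cr -= 128;
  /* R-Y = 1.40200 * (Cr - 128), using 22971 * 2^-14 */
  int r_sub_y = (22971 * cr + (1 << 13)) >> 14;
  /* B-Y = 1.77200 * (Cb - 128), using 29033 * 2^-14 */
  int b_sub_y = (29033 * cb + (1 << 13)) >> 14;
  /* G-Y = -0.34414 * (Cb - 128) - 0.71414 * (Cr - 128), using 2^-15 scale */
  int g_sub_y = (-11277 * cb - 23401 * cr + (1 << 14)) >> 15;
  *r = clamp(y + r_sub_y);
  *g = clamp(y + g_sub_y);
  *b = clamp(y + b_sub_y);
}

The final clamp corresponds to the saturating narrow (vqmovun_s16) used when packing the vector results.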
-/*
- * Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
* routines:
*
* Input memory buffers can be safely overread up to the next multiple of
- * ALIGN_SIZE bytes since they are always allocated by alloc_sarray() in
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
* jmemmgr.c.
*
- * The output buffer cannot safely be written beyond output_width since the
- * TurboJPEG API permits it to be allocated with or without padding up to the
- * next multiple of ALIGN_SIZE bytes.
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
*/
-/*
- * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal.
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
*/
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
@@ -65,10 +64,11 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
JSAMPARRAY output_buf)
{
JSAMPROW outptr;
- /* Pointers to Y, Cb and Cr data. */
+ /* Pointers to Y, Cb, and Cr data */
JSAMPROW inptr0, inptr1, inptr2;
- int16x8_t neg_128 = vdupq_n_s16(-128);
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0 = input_buf[0][in_row_group_ctr];
inptr1 = input_buf[1][in_row_group_ctr];
@@ -77,43 +77,55 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
- /* Load Y-values such that even pixel indices are in one vector and odd */
- /* pixel indices are in another vector. */
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
- /* Add Y and duplicate chroma components; upsampling horizontally. */
- int16x8_t g_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0]));
- int16x8_t r_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0]));
- int16x8_t b_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0]));
- int16x8_t g_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1]));
- int16x8_t r_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1]));
- int16x8_t b_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1]));
- /* Convert each component to unsigned and narrow, clamping to [0-255]. */
- /* Interleave pixel channel values having odd and even pixel indices. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
@@ -144,43 +156,55 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
}
if (cols_remaining > 0) {
- /* Load y-values such that even pixel indices are in one vector and odd */
- /* pixel indices are in another vector. */
+ /* De-interleave Y component values into two separate vectors, one
+ * containing the component values with even-numbered indices and one
+ * containing the component values with odd-numbered indices.
+ */
uint8x8x2_t y = vld2_u8(inptr0);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
- /* Add Y and duplicate chroma components - upsample horizontally. */
- int16x8_t g_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[0]));
- int16x8_t r_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[0]));
- int16x8_t b_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[0]));
- int16x8_t g_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y.val[1]));
- int16x8_t r_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y.val[1]));
- int16x8_t b_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y.val[1]));
- /* Convert each component to unsigned and narrow, clamping to [0-255]. */
- /* Interleave pixel channel values having odd and even pixel indices. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+ * "odd" Y component values. This effectively upsamples the chroma
+ * components horizontally.
+ */
+ int16x8_t g_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[0]));
+ int16x8_t r_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[0]));
+ int16x8_t b_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[0]));
+ int16x8_t g_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y.val[1]));
+ int16x8_t r_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y.val[1]));
+ int16x8_t b_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
@@ -200,38 +224,38 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
- case 15 :
+ case 15:
vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
- case 14 :
+ case 14:
vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
- case 13 :
+ case 13:
vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
- case 12 :
+ case 12:
vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
- case 11 :
+ case 11:
vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
- case 10 :
+ case 10:
vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
- case 9 :
+ case 9:
vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
- case 8 :
+ case 8:
vst4_u8(outptr, rgba_l);
break;
- case 7 :
+ case 7:
vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
- case 6 :
+ case 6:
vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
- case 5 :
+ case 5:
vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
- case 4 :
+ case 4:
vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
- case 3 :
+ case 3:
vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
- case 2 :
+ case 2:
vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
- case 1 :
+ case 1:
vst4_lane_u8(outptr, rgba_l, 0);
- default :
+ default:
break;
}
#else
@@ -245,38 +269,38 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
rgb_l.val[RGB_BLUE] = b.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
- case 15 :
+ case 15:
vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
- case 14 :
+ case 14:
vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
- case 13 :
+ case 13:
vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
- case 12 :
+ case 12:
vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
- case 11 :
+ case 11:
vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
- case 10 :
+ case 10:
vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
- case 9 :
+ case 9:
vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
- case 8 :
+ case 8:
vst3_u8(outptr, rgb_l);
break;
- case 7 :
+ case 7:
vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
- case 6 :
+ case 6:
vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
- case 5 :
+ case 5:
vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
- case 4 :
+ case 4:
vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
- case 3 :
+ case 3:
vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
- case 2 :
+ case 2:
vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
- case 1 :
+ case 1:
vst3_lane_u8(outptr, rgb_l, 0);
- default :
+ default:
break;
}
#endif
@@ -284,11 +308,10 @@ void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
}
-/*
- * Upsample and color convert from YCbCr -> RGB for the case of 2:1 horizontal
- * and 2:1 vertical.
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
*
- * See above for details of color conversion and safe memory buffer access.
+ * See comments above for details regarding color conversion and safe memory
+ * access.
*/
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
@@ -297,10 +320,11 @@ void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
JSAMPARRAY output_buf)
{
JSAMPROW outptr0, outptr1;
- /* Pointers to Y (both rows), Cb and Cr data. */
+ /* Pointers to Y (both rows), Cb, and Cr data */
JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
- int16x8_t neg_128 = vdupq_n_s16(-128);
+ const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+ const int16x8_t neg_128 = vdupq_n_s16(-128);
inptr0_0 = input_buf[0][in_row_group_ctr * 2];
inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
@@ -311,56 +335,74 @@ void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
int cols_remaining = output_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
- /* Load Y-values such that even pixel indices are in one vector and odd */
- /* pixel indices are in another vector. */
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
- /* Add Y and duplicate chroma components - upsample horizontally. */
- int16x8_t g0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0]));
- int16x8_t r0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0]));
- int16x8_t b0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0]));
- int16x8_t g0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1]));
- int16x8_t r0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1]));
- int16x8_t b0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1]));
- int16x8_t g1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0]));
- int16x8_t r1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0]));
- int16x8_t b1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0]));
- int16x8_t g1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1]));
- int16x8_t r1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1]));
- int16x8_t b1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1]));
- /* Convert each component to unsigned and narrow, clamping to [0-255]. */
- /* Interleave pixel channel values having odd and even pixel indices. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
@@ -405,56 +447,74 @@ void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
}
if (cols_remaining > 0) {
- /* Load Y-values such that even pixel indices are in one vector and */
- /* odd pixel indices are in another vector. */
+ /* For each row, de-interleave Y component values into two separate
+ * vectors, one containing the component values with even-numbered indices
+ * and one containing the component values with odd-numbered indices.
+ */
uint8x8x2_t y0 = vld2_u8(inptr0_0);
uint8x8x2_t y1 = vld2_u8(inptr0_1);
uint8x8_t cb = vld1_u8(inptr1);
uint8x8_t cr = vld1_u8(inptr2);
/* Subtract 128 from Cb and Cr. */
- int16x8_t cr_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
- int16x8_t cb_128 = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+ int16x8_t cr_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+ int16x8_t cb_128 =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
/* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
- int32x4_t g_sub_y_l = vmull_n_s16(vget_low_s16(cb_128), -F_0_344);
- int32x4_t g_sub_y_h = vmull_n_s16(vget_high_s16(cb_128), -F_0_344);
- g_sub_y_l = vmlsl_n_s16(g_sub_y_l, vget_low_s16(cr_128), F_0_714);
- g_sub_y_h = vmlsl_n_s16(g_sub_y_h, vget_high_s16(cr_128), F_0_714);
- /* Descale G components: shift right 15, round and narrow to 16-bit. */
+ int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+ int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+ g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+ g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+ /* Descale G components: shift right 15, round, and narrow to 16-bit. */
int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
vrshrn_n_s32(g_sub_y_h, 15));
/* Compute R-Y: 1.40200 * (Cr - 128) */
- int16x8_t r_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cr_128, 1), F_1_402);
+ int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
/* Compute B-Y: 1.77200 * (Cb - 128) */
- int16x8_t b_sub_y = vqrdmulhq_n_s16(vshlq_n_s16(cb_128, 1), F_1_772);
- /* Add Y and duplicate chroma components - upsample horizontally. */
- int16x8_t g0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[0]));
- int16x8_t r0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[0]));
- int16x8_t b0_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[0]));
- int16x8_t g0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y0.val[1]));
- int16x8_t r0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y0.val[1]));
- int16x8_t b0_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y0.val[1]));
- int16x8_t g1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[0]));
- int16x8_t r1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[0]));
- int16x8_t b1_even = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[0]));
- int16x8_t g1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y1.val[1]));
- int16x8_t r1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y1.val[1]));
- int16x8_t b1_odd = vreinterpretq_s16_u16(
- vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y1.val[1]));
- /* Convert each component to unsigned and narrow, clamping to [0-255]. */
- /* Interleave pixel channel values having odd and even pixel indices. */
+ int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+ /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+ * the "even" and "odd" Y component values. This effectively upsamples the
+ * chroma components both horizontally and vertically.
+ */
+ int16x8_t g0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[0]));
+ int16x8_t r0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[0]));
+ int16x8_t b0_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[0]));
+ int16x8_t g0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y0.val[1]));
+ int16x8_t r0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y0.val[1]));
+ int16x8_t b0_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y0.val[1]));
+ int16x8_t g1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[0]));
+ int16x8_t r1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[0]));
+ int16x8_t b1_even =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[0]));
+ int16x8_t g1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+ y1.val[1]));
+ int16x8_t r1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+ y1.val[1]));
+ int16x8_t b1_odd =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+ y1.val[1]));
+ /* Convert each component to unsigned and narrow, clamping to [0-255].
+ * Re-interleave the "even" and "odd" component values.
+ */
uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
@@ -486,53 +546,53 @@ void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
/* Store RGBA pixel data to memory. */
switch (cols_remaining) {
- case 15 :
+ case 15:
vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
- case 14 :
+ case 14:
vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
- case 13 :
+ case 13:
vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
- case 12 :
+ case 12:
vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
- case 11 :
+ case 11:
vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
- case 10 :
+ case 10:
vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
- case 9 :
+ case 9:
vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
- case 8 :
+ case 8:
vst4_u8(outptr0, rgba0_l);
vst4_u8(outptr1, rgba1_l);
break;
- case 7 :
+ case 7:
vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
- case 6 :
+ case 6:
vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
- case 5 :
+ case 5:
vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
- case 4 :
+ case 4:
vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
- case 3 :
+ case 3:
vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
- case 2 :
+ case 2:
vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
- case 1 :
+ case 1:
vst4_lane_u8(outptr0, rgba0_l, 0);
vst4_lane_u8(outptr1, rgba1_l, 0);
- default :
+ default:
break;
}
#else
@@ -553,53 +613,53 @@ void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
rgb1_l.val[RGB_BLUE] = b1.val[0];
/* Store RGB pixel data to memory. */
switch (cols_remaining) {
- case 15 :
+ case 15:
vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
- case 14 :
+ case 14:
vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
- case 13 :
+ case 13:
vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
- case 12 :
+ case 12:
vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
- case 11 :
+ case 11:
vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
- case 10 :
+ case 10:
vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
- case 9 :
+ case 9:
vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
- case 8 :
+ case 8:
vst3_u8(outptr0, rgb0_l);
vst3_u8(outptr1, rgb1_l);
break;
- case 7 :
+ case 7:
vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
- case 6 :
+ case 6:
vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
- case 5 :
+ case 5:
vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
- case 4 :
+ case 4:
vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
- case 3 :
+ case 3:
vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
- case 2 :
+ case 2:
vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
- case 1 :
+ case 1:
vst3_lane_u8(outptr0, rgb0_l, 0);
vst3_lane_u8(outptr1, rgb1_l, 0);
- default :
+ default:
break;
}
#endif
diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000..90ec678
--- /dev/null
+++ b/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | | | |
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * | | | |
+ * +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ * p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ * p0(upsampled) = s0
+ * p5(upsampled) = s2
+ */
+
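A scalar model of the h2v1 weighting just described, for one row (a sketch only; the vector routine below processes 16 samples per iteration and, as noted in its comments, applies slightly different rounding biases to the two output pixels of each sample for ordered dithering):

static void h2v1_fancy_upsample_row(const unsigned char *in,
                                    unsigned char *out,
                                    unsigned downsampled_width)
{
  unsigned i;
  for (i = 0; i < downsampled_width; i++) {
    /* Left output pixel of sample i: blend with the previous sample. */
    out[2 * i] = (i == 0) ? in[0] :
                 (unsigned char)((3 * in[i] + in[i - 1] + 2) >> 2);
    /* Right output pixel of sample i: blend with the next sample. */
    out[2 * i + 1] = (i == downsampled_width - 1) ? in[i] :
                     (unsigned char)((3 * in[i] + in[i + 1] + 2) >> 2);
  }
}

In the diagram above, i = 0 produces p0 and p1, i = 1 produces p2 and p3, and so on.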
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ /* First pixel component value in this row of the original image */
+ *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+ /* 3/4 * containing sample + 1/4 * nearest neighboring sample
+ * For p1: containing sample = s0, nearest neighboring sample = s1
+ * For p2: containing sample = s1, nearest neighboring sample = s0
+ */
+ uint8x16_t s0 = vld1q_u8(inptr);
+ uint8x16_t s1 = vld1q_u8(inptr + 1);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ uint16x8_t s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ uint16x8_t s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ uint16x8_t s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* The offset is initially 1, because the first pixel component has already
+ * been stored. However, in subsequent iterations of the SIMD loop, this
+ * offset is (2 * colctr - 1) to stay within the bounds of the sample
+ * buffers without having to resort to a slow scalar tail case for the last
+ * (downsampled_width % 16) samples. See "Creation of 2-D sample arrays"
+ * in jmemmgr.c for more details.
+ */
+ unsigned outptr_offset = 1;
+ uint8x16x2_t output_pixels;
+
+ /* We use software pipelining to maximise performance. The code indented
+ * an extra two spaces begins the next iteration of the loop.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+ s0 = vld1q_u8(inptr + colctr - 1);
+ s1 = vld1q_u8(inptr + colctr);
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ s1_add_3s0_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+ s1_add_3s0_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+ s0_add_3s1_l =
+ vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+ s0_add_3s1_h =
+ vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+ /* Add ordered dithering bias to odd pixel values. */
+ s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+ s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+ outptr_offset = 2 * colctr - 1;
+ }
+
+ /* Complete the last iteration of the loop. */
+
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+ vrshrn_n_u16(s1_add_3s0_h, 2));
+ output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+ vshrn_n_u16(s0_add_3s1_h, 2));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr + outptr_offset, output_pixels);
+
+ /* Last pixel component value in this row of the original image */
+ outptr[2 * downsampled_width - 1] =
+ GETJSAMPLE(inptr[downsampled_width - 1]);
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1 s2
+ * +---------+---------+---------+
+ * | p0 p1 | p2 p3 | p4 p5 |
+ * sA | | | |
+ * | p6 p7 | p8 p9 | p10 p11|
+ * +---------+---------+---------+
+ * | p12 p13| p14 p15| p16 p17|
+ * sB | | | |
+ * | p18 p19| p20 p21| p22 p23|
+ * +---------+---------+---------+
+ * | p24 p25| p26 p27| p28 p29|
+ * sC | | | |
+ * | p30 p31| p32 p33| p34 p35|
+ * +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above. To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1. For example:
+ * p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ * 1/4 * (3/4 * s0B + 1/4 * s0A)
+ * = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ * p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ * p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ * p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ * p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ * p0(upsampled) = s0A
+ * p35(upsampled) = s2C
+ */
+
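Expanding the 9:3:3:1 weighting for a single interior output pixel gives the following scalar sketch, which mirrors the two blending steps used by the vector code below (column blend first, then row blend, with a single divide by 16 at the end); the helper name and parameter names are illustrative:

static unsigned char h2v2_fancy_pixel(unsigned char nearest,         /* e.g. s1B */
                                      unsigned char vert_neighbor,   /* e.g. s1A */
                                      unsigned char horiz_neighbor,  /* e.g. s0B */
                                      unsigned char diag_neighbor)   /* e.g. s0A */
{
  /* Step 1: blend vertically within each column, in the ratio 3:1. */
  unsigned near_colsum = 3 * nearest + vert_neighbor;
  unsigned far_colsum = 3 * horiz_neighbor + diag_neighbor;
  /* Step 2: blend the column sums, in the ratio 3:1, then divide by 16 with
   * rounding.  The result weights the inputs 9/16, 3/16, 3/16, and 1/16. */
  return (unsigned char)((3 * near_colsum + far_colsum + 8) >> 4);
}

For the p14 example above: nearest = s1B, vert_neighbor = s1A, horiz_neighbor = s0B, and diag_neighbor = s0A. (For ordered dithering, the actual routine below uses a +7 rounding bias instead of +8 for half of the outputs.)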
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t seven_u16 = vdupq_n_u16(7);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+ const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ /* First pixel component value in this row of the original image */
+ int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+ *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+ int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+ *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+ /* Step 1: Blend samples vertically in columns s0 and s1.
+ * Leave the divide by 4 until the end, when it can be done for both
+ * dimensions at once, right-shifting by 4.
+ */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ uint8x16_t s0A = vld1q_u8(inptr0);
+ uint8x16_t s0B = vld1q_u8(inptr1);
+ uint8x16_t s0C = vld1q_u8(inptr2);
+ /* Multiplication makes vectors twice as wide. '_l' and '_h' suffixes
+ * denote low half and high half respectively.
+ */
+ uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+ vget_high_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+ vget_low_u8(s0B), three_u8);
+ uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+ vget_high_u8(s0B), three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+ uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+ uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+ uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+ vget_high_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+ vget_low_u8(s1B), three_u8);
+ uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+ vget_high_u8(s1B), three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ uint8x16x2_t output_pixels0 = { {
+ vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+ } };
+ uint8x16x2_t output_pixels1 = { {
+ vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+ vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+ } };
+
+ /* Store pixel component values to memory.
+ * The minimum size of the output buffer for each row is 64 bytes => no
+ * need to worry about buffer overflow here. See "Creation of 2-D sample
+ * arrays" in jmemmgr.c for more details.
+ */
+ vst2q_u8(outptr0 + 1, output_pixels0);
+ vst2q_u8(outptr1 + 1, output_pixels1);
+
+ /* The first pixel of the image shifted our loads and stores by one byte.
+ * We have to re-align on a 32-byte boundary at some point before the end
+ * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+ * bounds of the sample buffers without having to resort to a slow scalar
+ * tail case for the last (downsampled_width % 16) samples. See "Creation
+ * of 2-D sample arrays" in jmemmgr.c for more details.
+ */
+ for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+ /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+ /* Load and compute s0colsum0 and s0colsum1. */
+ s0A = vld1q_u8(inptr0 + colctr - 1);
+ s0B = vld1q_u8(inptr1 + colctr - 1);
+ s0C = vld1q_u8(inptr2 + colctr - 1);
+ s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+ three_u8);
+ s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+ three_u8);
+ s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+ three_u8);
+ /* Load and compute s1colsum0 and s1colsum1. */
+ s1A = vld1q_u8(inptr0 + colctr);
+ s1B = vld1q_u8(inptr1 + colctr);
+ s1C = vld1q_u8(inptr2 + colctr);
+ s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+ three_u8);
+ s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+ three_u8);
+ s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+ three_u8);
+
+ /* Step 2: Blend the already-blended columns. */
+
+ output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+ output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+ output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+ output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+ output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+ output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+ output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+ output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+ /* Add ordered dithering bias to odd pixel values. */
+ output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+ output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+ output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+ output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+ /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+ output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+ vshrn_n_u16(output0_p1_h, 4));
+ output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+ vrshrn_n_u16(output0_p2_h, 4));
+ output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+ vshrn_n_u16(output1_p1_h, 4));
+ output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+ vrshrn_n_u16(output1_p2_h, 4));
+ /* Store pixel component values to memory. */
+ vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+ vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+ }
+
+ /* Last pixel component value in this row of the original image */
+ int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr0[downsampled_width - 1]);
+ outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+ int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+ GETJSAMPLE(inptr2[downsampled_width - 1]);
+ outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+ inrow++;
+ }
+}
+
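For reference, the interior arithmetic that the main loop of the function above vectorizes can be written as the following scalar sketch (not taken from the patch; the function and variable names are illustrative). Each output pixel is a 9:3:3:1 blend of the four nearest samples, with the divide by 16 deferred to the end and the alternating rounding biases of 7 and 8 supplying the ordered dither; the -1/+0 load offsets in the Neon code correspond to the colsum[i - 1]/colsum[i] pairs here.

/* Scalar model of the interior output pixels produced above (the first and
 * last pixels of each row are handled separately, as in the Neon code).
 * in1 is the row containing the pixel centers; in0 and in2 are the rows
 * above and below it. */
static void h2v2_fancy_interior_scalar(const unsigned char *in0,
                                       const unsigned char *in1,
                                       const unsigned char *in2,
                                       unsigned char *out0,
                                       unsigned char *out1,
                                       unsigned downsampled_width)
{
  for (unsigned i = 1; i < downsampled_width; i++) {
    /* Step 1: vertical 3:1 blends (column sums), as in vmlal_u8() above. */
    int cs0_prev = 3 * in1[i - 1] + in0[i - 1];
    int cs0_curr = 3 * in1[i]     + in0[i];
    int cs1_prev = 3 * in1[i - 1] + in2[i - 1];
    int cs1_curr = 3 * in1[i]     + in2[i];
    /* Step 2: horizontal 3:1 blends.  Odd outputs add 7 before >> 4; even
     * outputs add 8, since vrshrn_n_u16(x, 4) computes (x + 8) >> 4. */
    out0[2 * i - 1] = (unsigned char)((3 * cs0_prev + cs0_curr + 7) >> 4);
    out0[2 * i]     = (unsigned char)((cs0_prev + 3 * cs0_curr + 8) >> 4);
    out1[2 * i - 1] = (unsigned char)((3 * cs1_prev + cs1_curr + 7) >> 4);
    out1[2 * i]     = (unsigned char)((cs1_prev + 3 * cs1_curr + 8) >> 4);
  }
}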
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ * +---------+
+ * | p0 |
+ * sA | |
+ * | p1 |
+ * +---------+
+ * | p2 |
+ * sB | |
+ * | p3 |
+ * +---------+
+ * | p4 |
+ * sC | |
+ * | p5 |
+ * +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above. To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1. For example:
+ * p1(upsampled) = 3/4 * sA + 1/4 * sB
+ * p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ * p0(upsampled) = sA
+ * p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+ /* Set up constants. */
+ const uint16x8_t one_u16 = vdupq_n_u16(1);
+ const uint8x8_t three_u8 = vdup_n_u8(3);
+
+ inrow = outrow = 0;
+ while (outrow < max_v_samp_factor) {
+ inptr0 = input_data[inrow - 1];
+ inptr1 = input_data[inrow];
+ inptr2 = input_data[inrow + 1];
+ /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+ * respectively.
+ */
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+ inrow++;
+
+ /* The size of the input and output buffers is always a multiple of 32
+ * bytes => no need to worry about buffer overflow when reading/writing
+ * memory. See "Creation of 2-D sample arrays" in jmemmgr.c for more
+ * details.
+ */
+ for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+ /* Load samples. */
+ uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+ uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+ uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+ /* Blend samples vertically. */
+ uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+ vget_high_u8(sB), three_u8);
+ uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+ vget_low_u8(sB), three_u8);
+ uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+ vget_high_u8(sB), three_u8);
+ /* Add ordered dithering bias to pixel values in even output rows. */
+ colsum0_l = vaddq_u16(colsum0_l, one_u16);
+ colsum0_h = vaddq_u16(colsum0_h, one_u16);
+ /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+ uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+ vshrn_n_u16(colsum0_h, 2));
+ uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+ vrshrn_n_u16(colsum1_h, 2));
+ /* Store pixel component values to memory. */
+ vst1q_u8(outptr0 + colctr, output_pixels0);
+ vst1q_u8(outptr1 + colctr, output_pixels1);
+ }
+ }
+}
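As a cross-check, the per-column arithmetic performed by jsimd_h1v2_fancy_upsample_neon() above reduces to the following scalar sketch (illustrative only, not taken from the patch). The upper output row rounds with a bias of 1, added explicitly before vshrn_n_u16(x, 2); the lower row uses vrshrn_n_u16(x, 2), which computes (x + 2) >> 2.

/* Scalar model of one column: sA, sB, and sC are the samples above, at, and
 * below the output pair, matching the diagram before the function. */
static void h1v2_fancy_column_scalar(unsigned char sA, unsigned char sB,
                                     unsigned char sC,
                                     unsigned char *out_upper,
                                     unsigned char *out_lower)
{
  *out_upper = (unsigned char)((3 * sB + sA + 1) >> 2);  /* ~3/4*sB + 1/4*sA */
  *out_lower = (unsigned char)((3 * sB + sC + 2) >> 2);  /* ~3/4*sB + 1/4*sC */
}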
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | | |
+ * | p0 p1 | p2 p3 |
+ * | | |
+ * +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above. To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ * p0(upsampled) = p1(upsampled) = s0
+ * p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow;
+ unsigned colctr;
+
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr + 2 * colctr, output_pixels);
+ }
+ }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ * s0 s1
+ * +---------+---------+
+ * | p0 p1 | p2 p3 |
+ * sA | | |
+ * | p4 p5 | p6 p7 |
+ * +---------+---------+
+ * | p8 p9 | p10 p11|
+ * sB | | |
+ * | p12 p13| p14 p15|
+ * +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above. To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ * p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ * p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ * p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ * p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow;
+ unsigned colctr;
+
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+ uint8x16_t samples = vld1q_u8(inptr + colctr);
+ /* Duplicate the samples. The store operation below interleaves them so
+ * that adjacent pixel component values take on the same sample value,
+ * per above.
+ */
+ uint8x16x2_t output_pixels = { { samples, samples } };
+ /* Store pixel component values for both output rows to memory.
+ * Due to the way sample buffers are allocated, we don't need to worry
+ * about tail cases when output_width is not a multiple of 32. See
+ * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+ */
+ vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+ vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+ }
+ }
+}
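The two plain upsamplers above perform no blending at all; storing the same vector in both lanes of vst2q_u8() simply interleaves each sample with itself. A scalar equivalent (illustrative only; names are not from the patch) is:

/* One row of h2v1/h2v2 replication: every sample is written twice
 * horizontally, and h2v2 additionally writes the identical row to a second
 * output row. */
static void h2v2_upsample_row_scalar(const unsigned char *inptr,
                                     unsigned char *outptr0,
                                     unsigned char *outptr1,
                                     unsigned output_width)
{
  for (unsigned i = 0; 2 * i < output_width; i++) {
    outptr0[2 * i] = outptr0[2 * i + 1] = inptr[i];
    outptr1[2 * i] = outptr1[2 * i + 1] = inptr[i];
  }
}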
diff --git a/simd/arm/common/jfdctfst-neon.c b/simd/arm/jfdctfst-neon.c
index e7b2e96..bb371be 100644
--- a/simd/arm/common/jfdctfst-neon.c
+++ b/simd/arm/jfdctfst-neon.c
@@ -1,7 +1,7 @@
/*
- * jfdctfst-neon.c - fast DCT (Arm NEON)
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,21 +21,21 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
-/*
- * 'jsimd_fdct_ifast_neon' performs a fast, not so accurate forward DCT
- * (Discrete Cosine Transform) on one block of samples. It uses the same
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
* calculations and produces exactly the same output as IJG's original
- * 'jpeg_fdct_ifast' function, which can be found in jfdctfst.c.
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.382683433 = 12544 * 2^-15
@@ -43,9 +43,9 @@
* 0.707106781 = 23168 * 2^-15
* 0.306562965 = 9984 * 2^-15
*
- * See jfdctfst.c for further details of the IDCT algorithm. Where possible,
- * the variable names and comments here in 'jsimd_fdct_ifast_neon' match up
- * with those in 'jpeg_fdct_ifast'.
+ * See jfdctfst.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
*/
#define F_0_382 12544
@@ -53,16 +53,17 @@
#define F_0_707 23168
#define F_0_306 9984
+
ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
F_0_382, F_0_541, F_0_707, F_0_306
};
void jsimd_fdct_ifast_neon(DCTELEM *data)
{
- /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
- /* are used followed by vuzp to transpose the block such that we have a */
- /* column of samples per vector - allowing all rows to be processed at */
- /* once. */
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
int16x8x4_t data1 = vld4q_s16(data);
int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
@@ -80,10 +81,11 @@ void jsimd_fdct_ifast_neon(DCTELEM *data)
int16x8_t col6 = cols_26.val[1];
int16x8_t col7 = cols_37.val[1];
+ /* Pass 1: process rows. */
+
/* Load DCT conversion constants. */
const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
- /* Pass 1: process rows. */
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
@@ -157,6 +159,7 @@ void jsimd_fdct_ifast_neon(DCTELEM *data)
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
/* Pass 2: process columns. */
+
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
diff --git a/simd/arm/common/jfdctint-neon.c b/simd/arm/jfdctint-neon.c
index 55abb1b..ccfc07b 100644
--- a/simd/arm/common/jfdctint-neon.c
+++ b/simd/arm/jfdctint-neon.c
@@ -1,7 +1,8 @@
/*
- * jfdctint-neon.c - accurate DCT (Arm NEON)
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
*
- * Copyright 2020 The Chromium Aruthors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,21 +22,22 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
#include <arm_neon.h>
-/*
- * 'jsimd_fdct_islow_neon' performs a slow-but-accurate forward DCT (Discrete
- * Cosine Transform) on one block of samples. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_islow'
- * function, which can be found in jfdctint.c.
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples. It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.298631336 = 2446 * 2^-13
@@ -51,9 +53,9 @@
* 2.562915447 = 20995 * 2^-13
* 3.072711026 = 25172 * 2^-13
*
- * See jfdctint.c for further details of the DCT algorithm. Where possible,
- * the variable names and comments here in 'jsimd_fdct_islow_neon' match up
- * with those in 'jpeg_fdct_islow'.
+ * See jfdctint.c for further details of the DCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
*/
#define CONST_BITS 13
@@ -75,6 +77,7 @@
#define F_2_562 20995
#define F_3_072 25172
+
ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
F_0_298, -F_0_390, F_0_541, F_0_765,
-F_0_899, F_1_175, F_1_501, -F_1_847,
@@ -84,20 +87,20 @@ ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
void jsimd_fdct_islow_neon(DCTELEM *data)
{
/* Load DCT constants. */
-#if defined(__clang__) || defined(_MSC_VER)
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
/* GCC does not currently support the intrinsic vld1_<type>_x3(). */
const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
- const int16x4x3_t consts = { consts1, consts2, consts3 };
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif
- /* Load an 8x8 block of samples into Neon registers. De-interleaving loads */
- /* are used followed by vuzp to transpose the block such that we have a */
- /* column of samples per vector - allowing all rows to be processed at */
- /* once. */
+ /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
+ * are used, followed by vuzp to transpose the block such that we have a
+ * column of samples per vector - allowing all rows to be processed at once.
+ */
int16x8x4_t s_rows_0123 = vld4q_s16(data);
int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
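The change above from { consts1, consts2, consts3 } to { { consts1, consts2, consts3 } }, repeated for every Neon multi-vector initializer in this patch, is purely syntactic: the intNxMxK_t types are structs whose only member is a small array of vectors, so the inner braces explicitly initialize that array member (and avoid missing-braces warnings on compilers that emit them). A simplified stand-in for the real arm_neon.h definition, for illustration only:

/* Simplified sketch of the shape of int16x4x3_t (the real definition lives
 * in arm_neon.h): */
typedef struct {
  int16x4_t val[3];
} int16x4x3_like_t;

/* const int16x4x3_like_t consts = { { consts1, consts2, consts3 } };
 * outer braces initialize the struct, inner braces initialize val[]. */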
@@ -116,6 +119,7 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
int16x8_t col7 = cols_37.val[1];
/* Pass 1: process rows. */
+
int16x8_t tmp0 = vaddq_s16(col0, col7);
int16x8_t tmp7 = vsubq_s16(col0, col7);
int16x8_t tmp1 = vaddq_s16(col1, col6);
@@ -125,7 +129,7 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
int16x8_t tmp3 = vaddq_s16(col3, col4);
int16x8_t tmp4 = vsubq_s16(col3, col4);
- /* Even part. */
+ /* Even part */
int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
@@ -135,26 +139,26 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
- int32x4_t z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13),
- consts.val[0], 2);
- int32x4_t z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13),
- consts.val[0], 2);
-
- int32x4_t col2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
- consts.val[0], 3);
- int32x4_t col2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
- consts.val[0], 3);
+ int32x4_t z1_l =
+ vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+ int32x4_t z1_h =
+ vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+ int32x4_t col2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t col2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
- int32x4_t col6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
- consts.val[1], 3);
- int32x4_t col6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
- consts.val[1], 3);
+ int32x4_t col6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t col6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
- /* Odd part. */
+ /* Odd part */
int16x8_t z1 = vaddq_s16(tmp4, tmp7);
int16x8_t z2 = vaddq_s16(tmp5, tmp6);
int16x8_t z3 = vaddq_s16(tmp4, tmp6);
@@ -253,7 +257,8 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
- /* Pass 2. */
+ /* Pass 2: process columns. */
+
tmp0 = vaddq_s16(row0, row7);
tmp7 = vsubq_s16(row0, row7);
tmp1 = vaddq_s16(row1, row6);
@@ -263,7 +268,7 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
tmp3 = vaddq_s16(row3, row4);
tmp4 = vsubq_s16(row3, row4);
- /* Even part. */
+ /* Even part */
tmp10 = vaddq_s16(tmp0, tmp3);
tmp13 = vsubq_s16(tmp0, tmp3);
tmp11 = vaddq_s16(tmp1, tmp2);
@@ -276,21 +281,21 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
- int32x4_t row2_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp13),
- consts.val[0], 3);
- int32x4_t row2_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp13),
- consts.val[0], 3);
+ int32x4_t row2_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+ int32x4_t row2_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
- int32x4_t row6_scaled_l = vmlal_lane_s16(z1_l, vget_low_s16(tmp12),
- consts.val[1], 3);
- int32x4_t row6_scaled_h = vmlal_lane_s16(z1_h, vget_high_s16(tmp12),
- consts.val[1], 3);
+ int32x4_t row6_scaled_l =
+ vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+ int32x4_t row6_scaled_h =
+ vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
- /* Odd part. */
+ /* Odd part */
z1 = vaddq_s16(tmp4, tmp7);
z2 = vaddq_s16(tmp5, tmp6);
z3 = vaddq_s16(tmp4, tmp6);
diff --git a/simd/arm/common/jidctfst-neon.c b/simd/arm/jidctfst-neon.c
index 87806fd..a91be53 100644
--- a/simd/arm/common/jidctfst-neon.c
+++ b/simd/arm/jidctfst-neon.c
@@ -1,7 +1,7 @@
/*
- * jidctfst-neon.c - fast IDCT (Arm NEON)
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,20 +21,21 @@
*/
#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
#include <arm_neon.h>
-/*
- * 'jsimd_idct_ifast_neon' performs dequantization and a fast, not so accurate
- * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It
* uses the same calculations and produces exactly the same output as IJG's
- * original 'jpeg_idct_ifast' function, which can be found in jidctfst.c.
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
* 0.082392200 = 2688 * 2^-15
@@ -42,9 +43,9 @@
* 0.847759065 = 27776 * 2^-15
* 0.613125930 = 20096 * 2^-15
*
- * See jidctfst.c for further details of the IDCT algorithm. Where possible,
- * the variable names and comments here in 'jsimd_idct_ifast_neon' match up
- * with those in 'jpeg_idct_ifast'.
+ * See jidctfst.c for further details of the IDCT algorithm. Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
*/
#define PASS1_BITS 2
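A note on how the Q15 constants above are applied (a sketch, not taken from the patch): the vqdmulh*_lane_s16() calls introduced below return the high half of the doubling product, so multiplying by F_0_847 = 27776 approximates multiplication by 27776 / 2^15, about 0.8477. Gathering the four constants into one vector and selecting them with the _lane forms presumably lets them be kept in a single register instead of being duplicated for each multiply.

/* Scalar model of the Q15 multiply (ignoring the saturation case):
 * vqdmulh_s16(x, F) ~= (2 * x * F) >> 16 == (x * F) >> 15 */
static short q15_mul_model(short x, short f_q15)
{
  return (short)(((int)x * f_q15) >> 15);
}

/* Example: q15_mul_model(1000, 27776) == 847, i.e. roughly 1000 * 0.847759. */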
@@ -54,10 +55,13 @@
#define F_0_847 27776
#define F_0_613 20096
-void jsimd_idct_ifast_neon(void *dct_table,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf,
- JDIMENSION output_col)
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+ F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
IFAST_MULT_TYPE *quantptr = dct_table;
@@ -87,9 +91,13 @@ void jsimd_idct_ifast_neon(void *dct_table,
int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
- /* All AC coefficients are zero. */
- /* Compute DC values and duplicate into vectors. */
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into vectors.
+ */
int16x8_t dcval = row0;
row1 = dcval;
row2 = dcval;
@@ -99,12 +107,14 @@ void jsimd_idct_ifast_neon(void *dct_table,
row6 = dcval;
row7 = dcval;
} else if (left_ac_bitmap == 0) {
- /* AC coefficients are zero for columns 0, 1, 2 and 3. */
- /* Use DC values for these columns. */
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Use DC values for these columns.
+ */
int16x4_t dcval = vget_low_s16(row0);
- /* Commence regular fast IDCT computation for columns 4, 5, 6 and 7. */
- /* Load quantization table.*/
+ /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+ /* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
@@ -124,7 +134,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
- int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
@@ -146,16 +156,16 @@ void jsimd_idct_ifast_neon(void *dct_table,
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
- tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
- int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
- tmp10 = vqdmulh_n_s16(z12, F_0_082);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
- tmp12 = vqdmulh_n_s16(neg_z10, F_0_613);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
@@ -172,12 +182,14 @@ void jsimd_idct_ifast_neon(void *dct_table,
row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
} else if (right_ac_bitmap == 0) {
- /* AC coefficients are zero for columns 4, 5, 6 and 7. */
- /* Use DC values for these columns. */
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Use DC values for these columns.
+ */
int16x4_t dcval = vget_high_s16(row0);
- /* Commence regular fast IDCT computation for columns 0, 1, 2 and 3. */
- /* Load quantization table.*/
+ /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+ /* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
@@ -197,7 +209,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */
int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
- int16x4_t tmp12 = vqdmulh_n_s16(tmp1_sub_tmp3, F_0_414);
+ int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsub_s16(tmp12, tmp13);
@@ -219,16 +231,16 @@ void jsimd_idct_ifast_neon(void *dct_table,
tmp7 = vadd_s16(z11, z13); /* phase 5 */
int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
- tmp11 = vqdmulh_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vadd_s16(tmp11, z11_sub_z13);
int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
- int16x4_t z5 = vqdmulh_n_s16(z10_add_z12, F_0_847);
+ int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
z5 = vadd_s16(z5, z10_add_z12);
- tmp10 = vqdmulh_n_s16(z12, F_0_082);
+ tmp10 = vqdmulh_lane_s16(z12, consts, 0);
tmp10 = vadd_s16(tmp10, z12);
tmp10 = vsub_s16(tmp10, z5);
- tmp12 = vqdmulh_n_s16(neg_z10, F_0_613);
+ tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
tmp12 = vadd_s16(tmp12, z5);
@@ -246,7 +258,8 @@ void jsimd_idct_ifast_neon(void *dct_table,
row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
} else {
/* Some AC coefficients are non-zero; full IDCT calculation required. */
- /* Load quantization table.*/
+
+ /* Load quantization table. */
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
@@ -266,7 +279,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */
int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
- int16x8_t tmp12 = vqdmulhq_n_s16(tmp1_sub_tmp3, F_0_414);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
tmp12 = vsubq_s16(tmp12, tmp13);
@@ -288,16 +301,16 @@ void jsimd_idct_ifast_neon(void *dct_table,
tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
- tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
- int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
- tmp10 = vqdmulhq_n_s16(z12, F_0_082);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
- tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
@@ -315,7 +328,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
row3 = vsubq_s16(tmp3, tmp4);
}
- /* Tranpose rows to work on columns in pass 2. */
+ /* Transpose rows to work on columns in pass 2. */
int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
@@ -344,14 +357,15 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
- /* 1-D IDCT, pass 2. */
- /* Even part. */
+ /* 1-D IDCT, pass 2 */
+
+ /* Even part */
int16x8_t tmp10 = vaddq_s16(col0, col4);
int16x8_t tmp11 = vsubq_s16(col0, col4);
int16x8_t tmp13 = vaddq_s16(col2, col6);
int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
- int16x8_t tmp12 = vqdmulhq_n_s16(col2_sub_col6, F_0_414);
+ int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
tmp12 = vaddq_s16(tmp12, col2_sub_col6);
tmp12 = vsubq_s16(tmp12, tmp13);
@@ -360,7 +374,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
- /* Odd part. */
+ /* Odd part */
int16x8_t z13 = vaddq_s16(col5, col3);
int16x8_t neg_z10 = vsubq_s16(col3, col5);
int16x8_t z11 = vaddq_s16(col1, col7);
@@ -368,16 +382,16 @@ void jsimd_idct_ifast_neon(void *dct_table,
int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */
int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
- tmp11 = vqdmulhq_n_s16(z11_sub_z13, F_0_414);
+ tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
tmp11 = vaddq_s16(tmp11, z11_sub_z13);
int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
- int16x8_t z5 = vqdmulhq_n_s16(z10_add_z12, F_0_847);
+ int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
z5 = vaddq_s16(z5, z10_add_z12);
- tmp10 = vqdmulhq_n_s16(z12, F_0_082);
+ tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
tmp10 = vaddq_s16(tmp10, z12);
tmp10 = vsubq_s16(tmp10, z5);
- tmp12 = vqdmulhq_n_s16(neg_z10, F_0_613);
+ tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
tmp12 = vaddq_s16(tmp12, z5);
@@ -394,7 +408,7 @@ void jsimd_idct_ifast_neon(void *dct_table,
col4 = vaddq_s16(tmp3, tmp4);
col3 = vsubq_s16(tmp3, tmp4);
- /* Scale down by factor of 8, narrowing to 8-bit. */
+ /* Scale down by a factor of 8, narrowing to 8-bit. */
int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
vqshrn_n_s16(col1, PASS1_BITS + 3));
int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
@@ -404,16 +418,20 @@ void jsimd_idct_ifast_neon(void *dct_table,
int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
vqshrn_n_s16(col7, PASS1_BITS + 3));
/* Clamp to range [0-255]. */
- uint8x16_t cols_01 = vreinterpretq_u8_s8(
- vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
- uint8x16_t cols_45 = vreinterpretq_u8_s8(
- vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
- uint8x16_t cols_23 = vreinterpretq_u8_s8(
- vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
- uint8x16_t cols_67 = vreinterpretq_u8_s8(
- vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
-
- /* Transpose block ready for store. */
+ uint8x16_t cols_01 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_45 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_23 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+ uint8x16_t cols_67 =
+ vreinterpretq_u8_s8
+ (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+ /* Transpose block to prepare for store. */
uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
vreinterpretq_u32_u8(cols_45));
uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
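A note on the clamp a few lines above (a sketch of the reasoning, not taken from the patch): vqshrn_n_s16() saturates the descaled values to the signed 8-bit range [-128, 127], and adding CENTERJSAMPLE (128 for 8-bit samples) then maps that range exactly onto [0, 255], so no explicit min/max is needed. In scalar terms:

/* Scalar model of one output sample (shift = PASS1_BITS + 3 above). */
static unsigned char descale_clamp_model(short col, int shift)
{
  int v = col >> shift;            /* vqshrn_n_s16: arithmetic shift right */
  if (v < -128) v = -128;          /* ...then saturate to the int8 range */
  if (v > 127) v = 127;
  return (unsigned char)(v + 128); /* equivalent of the CENTERJSAMPLE add */
}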
diff --git a/simd/arm/common/jidctint-neon.c b/simd/arm/jidctint-neon.c
index 0fd4a36..043b652 100644
--- a/simd/arm/common/jidctint-neon.c
+++ b/simd/arm/jidctint-neon.c
@@ -1,7 +1,8 @@
/*
- * jidctint-neon.c - slow IDCT (Arm NEON)
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,16 +22,19 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
#include <arm_neon.h>
+
#define CONST_BITS 13
#define PASS1_BITS 2
@@ -38,7 +42,7 @@
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
/* The computation of the inverse DCT requires the use of constants known at
- * compile-time. Scaled integer constants are used to avoid floating-point
+ * compile time. Scaled integer constants are used to avoid floating-point
* arithmetic:
* 0.298631336 = 2446 * 2^-13
* 0.390180644 = 3196 * 2^-13
@@ -76,19 +80,21 @@
#define F_2_053_MINUS_2_562 (F_2_053 - F_2_562)
#define F_0_541_PLUS_0_765 (F_0_541 + F_0_765)
+
ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
- F_0_899, F_0_541,
- F_2_562, F_0_298_MINUS_0_899,
- F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
- F_0_541_PLUS_0_765, F_1_175,
- F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
- F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
- 0, 0, 0, 0
- };
-
-/* Forward declaration of regular and sparse IDCT helper functions. */
-
-static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+ F_0_899, F_0_541,
+ F_2_562, F_0_298_MINUS_0_899,
+ F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+ F_0_541_PLUS_0_765, F_1_175,
+ F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+ F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+ 0, 0, 0, 0
+};
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
@@ -107,7 +113,7 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16_t *workspace_1,
int16_t *workspace_2);
-static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
@@ -118,32 +124,33 @@ static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16_t *workspace_1,
int16_t *workspace_2);
-static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
-static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
-/* Performs dequantization and inverse DCT on one block of coefficients. For
- * reference, the C implementation 'jpeg_idct_slow' can be found jidctint.c.
+/* Perform dequantization and inverse DCT on one block of coefficients. For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
*
- * Optimization techniques used for data access:
+ * Optimization techniques used for fast data access:
*
- * In each pass, the inverse DCT is computed on the left and right 4x8 halves
- * of the DCT block. This avoids spilling due to register pressure and the
- * increased granularity allows an optimized calculation depending on the
- * values of the DCT coefficients. Between passes, intermediate data is stored
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block. This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients. Between passes, intermediate data is stored
* in 4x8 workspace buffers.
*
* Transposing the 8x8 DCT block after each pass can be achieved by transposing
- * each of the four 4x4 quadrants, and swapping quadrants 1 and 2 (in the
- * diagram below.) Swapping quadrants is cheap as the second pass can just load
- * from the other workspace buffer.
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.) Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
*
* +-------+-------+ +-------+-------+
* | | | | | |
@@ -158,32 +165,30 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
* Optimization techniques used to accelerate the inverse DCT calculation:
*
* In a DCT coefficient block, the coefficients are increasingly likely to be 0
- * moving diagonally from top left to bottom right. If whole rows of
- * coefficients are 0, the inverse DCT calculation can be simplified. In this
- * NEON implementation, on the first pass of the inverse DCT, we test for three
- * special cases before defaulting to a full 'regular' inverse DCT:
+ * as you move diagonally from top left to bottom right. If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified. On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
*
- * i) AC and DC coefficients are all zero. (Only tested for the right 4x8
- * half of the DCT coefficient block.) In this case the inverse DCT result
- * is all zero. We do no work here, signalling that the 'sparse' case is
- * required in the second pass.
- * ii) AC coefficients (all but the top row) are zero. In this case, the value
- * of the inverse DCT of the AC coefficients is just the DC coefficients.
- * iii) Coefficients of rows 4, 5, 6 and 7 are all zero. In this case we opt to
- * execute a 'sparse' simplified inverse DCT.
+ * 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
+ * "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
+ * result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero. In this case, the inverse DCT
+ * result is all zero. For the left 4x8 half, this is handled identically
+ * to Case 2 above. For the right 4x8 half, we do no work and signal that
+ * the "sparse" algorithm is required for the second pass.
*
- * In the second pass, only a single special case is tested: whether the the AC
- * and DC coefficients were all zero in the right 4x8 block in the first pass
- * (case 'i'). If this is the case, a 'sparse' variant of the second pass
- * inverse DCT is executed for both the left and right halves of the DCT block.
- * (The transposition after the first pass would have made the bottom half of
- * the block all zero.)
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.) If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block. (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
*/
-void jsimd_idct_islow_neon(void *dct_table,
- JCOEFPTR coef_block,
- JSAMPARRAY output_buf,
- JDIMENSION output_col)
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
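The special-case dispatch described in the comment above can be summarized by the following condensed sketch (illustrative; the helper calls are abbreviated, the dequantization is omitted, and the exact control flow is only partially visible in the hunks below). The AC "bitmap" is simply the bitwise OR of coefficient rows, so an all-zero result proves that every coefficient in those rows is zero.

#include <arm_neon.h>

/* Condensed sketch of the first-pass dispatch for one 4x8 half. */
static void islow_pass1_dispatch_sketch(int16x4_t row1, int16x4_t row2,
                                        int16x4_t row3, int16x4_t row4,
                                        int16x4_t row5, int16x4_t row6,
                                        int16x4_t row7)
{
  /* OR rows 4-7 together; a zero result means those rows are all zero. */
  int16x4_t bitmap = vorr_s16(row7, row6);
  bitmap = vorr_s16(bitmap, row5);
  bitmap = vorr_s16(bitmap, row4);

  if (vget_lane_s64(vreinterpret_s64_s16(bitmap), 0) == 0) {
    /* Rows 4-7 are zero; now test rows 1-3 as well. */
    bitmap = vorr_s16(vorr_s16(bitmap, row3), vorr_s16(row2, row1));
    if (vget_lane_s64(vreinterpret_s64_s16(bitmap), 0) == 0) {
      /* Cases 2 and 3: all AC rows are zero -- emit the scaled DC row. */
    } else {
      /* Case 1: only rows 0-3 are non-zero -- "sparse" first pass. */
    }
  } else {
    /* General case -- "regular" first pass. */
  }
}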
@@ -191,6 +196,7 @@ void jsimd_idct_islow_neon(void *dct_table,
int16_t workspace_r[8 * DCTSIZE / 2];
/* Compute IDCT first pass on left 4x8 coefficient block. */
+
/* Load DCT coefficients in left 4x8 block. */
int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
@@ -225,7 +231,7 @@ void jsimd_idct_islow_neon(void *dct_table,
if (left_ac_bitmap == 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
- int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l, quadrant);
vst4_s16(workspace_r, quadrant);
@@ -242,8 +248,9 @@ void jsimd_idct_islow_neon(void *dct_table,
workspace_l, workspace_r);
}
- /* Compute IDCT first pass on right 4x8 coefficient block.*/
- /* Load DCT coefficients for right 4x8 block. */
+ /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+ /* Load DCT coefficients in right 4x8 block. */
row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
@@ -273,7 +280,7 @@ void jsimd_idct_islow_neon(void *dct_table,
bitmap = vorr_s16(bitmap, row1);
int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
- /* Initialise to non-zero value: defaults to regular second pass. */
+ /* If this remains non-zero, a "regular" second pass will be performed. */
int64_t right_ac_dc_bitmap = 1;
if (right_ac_bitmap == 0) {
@@ -282,7 +289,7 @@ void jsimd_idct_islow_neon(void *dct_table,
if (right_ac_dc_bitmap != 0) {
int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
- int16x4x4_t quadrant = { dcval, dcval, dcval, dcval };
+ int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
/* Store 4x4 blocks to workspace, transposing in the process. */
vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
@@ -304,7 +311,8 @@ void jsimd_idct_islow_neon(void *dct_table,
}
/* Second pass: compute IDCT on rows in workspace. */
- /* If all coefficients in right 4x8 block are 0, use 'sparse' second pass. */
+
+ /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
if (right_ac_dc_bitmap == 0) {
jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
@@ -315,19 +323,19 @@ void jsimd_idct_islow_neon(void *dct_table,
}
-/* Performs dequantization and the first pass of the slow-but-accurate inverse
- * DCT on a 4x8 block of coefficients. (To process the full 8x8 DCT block this
- * function - or some other optimized variant - needs to be called on both the
- * right and left 4x8 blocks.)
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients. (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
*
- * This 'regular' version assumes that no optimization can be made to the IDCT
- * calculation since no useful set of AC coefficients are all 0.
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
*
- * The original C implementation of the slow IDCT 'jpeg_idct_slow' can be found
- * in jidctint.c. Algorithmic changes made here are documented inline.
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c. Algorithmic changes made here are documented inline.
*/
-static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
@@ -346,10 +354,17 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16_t *workspace_1,
int16_t *workspace_2)
{
- /* Load constants for IDCT calculation. */
+ /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
-
- /* Even part. */
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
@@ -369,7 +384,7 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
- /* Odd part. */
+ /* Odd part */
int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
@@ -378,7 +393,7 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
- /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
@@ -393,7 +408,7 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
- /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
@@ -426,33 +441,36 @@ static inline void jsimd_idct_islow_pass1_regular(int16x4_t row0,
tmp3 = vaddq_s32(tmp3, z4);
/* Final output stage: descale and narrow to 16-bit. */
- int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
- };
- int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
- };
-
- /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
- /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
-/* Performs dequantization and the first pass of the slow-but-accurate inverse
- * DCT on a 4x8 block of coefficients.
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
*
- * This 'sparse' version assumes that the AC coefficients in rows 4, 5, 6 and 7
- * are all 0. This simplifies the IDCT calculation, accelerating overall
- * performance.
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0. This simplifies the IDCT calculation, accelerating overall performance.
*/
-static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
@@ -464,11 +482,17 @@ static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16_t *workspace_2)
{
/* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
-
- /* Even part. */
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
- /* z3 is all 0. */
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
@@ -482,8 +506,7 @@ static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
- /* Odd part. */
- /* tmp0 and tmp1 are both all 0. */
+ /* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
@@ -501,46 +524,58 @@ static inline void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
/* Final output stage: descale and narrow to 16-bit. */
- int16x4x4_t rows_0123 = { vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
- vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
- };
- int16x4x4_t rows_4567 = { vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
- vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
- };
-
- /* Store 4x4 blocks to the intermediate workspace ready for second pass. */
- /* (VST4 transposes the blocks - we need to operate on rows in next pass.) */
+ int16x4x4_t rows_0123 = { {
+ vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+ } };
+ int16x4x4_t rows_4567 = { {
+ vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+ vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+ } };
+
+ /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+ * (VST4 transposes the blocks. We need to operate on rows in the next
+ * pass.)
+ */
vst4_s16(workspace_1, rows_0123);
vst4_s16(workspace_2, rows_4567);
}
-/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
- * of coefficients. (To process the full 8x8 DCT block this function - or some
- * other optimized variant - needs to be called on both the right and left 4x8
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients. (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
* blocks.)
*
- * This 'regular' version assumes that no optimization can be made to the IDCT
- * calculation since no useful set of coefficient values are all 0 after the
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
* first pass.
*
- * Again, the original C implementation of the slow IDCT 'jpeg_idct_slow' can
- * be found in jidctint.c. Algorithmic changes made here are documented inline.
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c. Algorithmic changes made here are documented
+ * inline.
*/
-static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
-
- /* Even part. */
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
@@ -560,7 +595,7 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
- /* Odd part. */
+ /* Odd part */
int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
@@ -569,7 +604,7 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
- /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
* z5 = (z3 + z4) * 1.175875602;
* z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
* z3 += z5; z4 += z5;
@@ -584,7 +619,7 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
- /* Implementation as per 'jpeg_idct_islow' in jidctint.c:
+ /* Implementation as per jpeg_idct_islow() in jidctint.c:
* z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
* tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
* tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
@@ -640,15 +675,17 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
- /* Transpose 4x8 block and store to memory. */
- /* Zipping adjacent columns together allows us to store 16-bit elements. */
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
- uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
- vreinterpret_u16_u8(cols_01_23.val[1]),
- vreinterpret_u16_u8(cols_45_67.val[0]),
- vreinterpret_u16_u8(cols_45_67.val[1])
- };
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
@@ -662,25 +699,31 @@ static inline void jsimd_idct_islow_pass2_regular(int16_t *workspace,
}
-/* Performs the second pass of the slow-but-accurate inverse DCT on a 4x8 block
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
* of coefficients.
*
- * This 'sparse' version assumes that the coefficient values (after the first
- * pass) in rows 4, 5, 6 and 7 are all 0. This simplifies the IDCT calculation,
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0. This simplifies the IDCT calculation,
* accelerating overall performance.
*/
-static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset)
{
/* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
-
- /* Even part. */
+#else
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+ /* Even part (z3 is all 0) */
int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
- /* z3 is all 0. */
int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
@@ -694,8 +737,7 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
- /* Odd part. */
- /* tmp0 and tmp1 are both all 0. */
+ /* Odd part (tmp0 and tmp1 are both all 0) */
int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
@@ -736,15 +778,17 @@ static inline void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
vdup_n_u8(CENTERJSAMPLE));
- /* Transpose 4x8 block and store to memory. */
- /* Zipping adjacent columns together allow us to store 16-bit elements. */
+ /* Transpose 4x8 block and store to memory. (Zipping adjacent columns
+ * together allows us to store 16-bit elements.)
+ */
uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
- uint16x4x4_t cols_01_23_45_67 = { vreinterpret_u16_u8(cols_01_23.val[0]),
- vreinterpret_u16_u8(cols_01_23.val[1]),
- vreinterpret_u16_u8(cols_45_67.val[0]),
- vreinterpret_u16_u8(cols_45_67.val[1])
- };
+ uint16x4x4_t cols_01_23_45_67 = { {
+ vreinterpret_u16_u8(cols_01_23.val[0]),
+ vreinterpret_u16_u8(cols_01_23.val[1]),
+ vreinterpret_u16_u8(cols_45_67.val[0]),
+ vreinterpret_u16_u8(cols_45_67.val[1])
+ } };
JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
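For context, a minimal sketch (not from the patch) of how the two second-pass variants above might be selected. The wrapper and its rows_4_to_7_are_zero flag are illustrative assumptions; the actual dispatch is performed by the full-block IDCT routine elsewhere in jidctint-neon.c.

static void idct_islow_pass2(int16_t *workspace, JSAMPARRAY output_buf,
                             JDIMENSION output_col, unsigned buf_offset,
                             int rows_4_to_7_are_zero)
{
  /* rows_4_to_7_are_zero: hypothetical flag, set when the coefficient values
   * produced by the first pass in rows 4-7 are all 0. */
  if (rows_4_to_7_are_zero)
    jsimd_idct_islow_pass2_sparse(workspace, output_buf, output_col,
                                  buf_offset);
  else
    jsimd_idct_islow_pass2_regular(workspace, output_buf, output_col,
                                   buf_offset);
}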
diff --git a/simd/arm/common/jidctred-neon.c b/simd/arm/jidctred-neon.c
index ed4232c..be9627e 100644
--- a/simd/arm/common/jidctred-neon.c
+++ b/simd/arm/jidctred-neon.c
@@ -1,7 +1,8 @@
/*
- * jidctred-neon.c - reduced-size IDCT (Arm NEON)
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
*
- * Copyright 2019 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,16 +22,18 @@
*/
#define JPEG_INTERNALS
-#include "../../../jconfigint.h"
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
#include <arm_neon.h>
+
#define CONST_BITS 13
#define PASS1_BITS 2
@@ -49,10 +52,10 @@
#define F_2_562 20995
#define F_3_624 29692
-/*
- * 'jsimd_idct_2x2_neon' is an inverse-DCT function for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations and
- * produces exactly the same output as IJG's original 'jpeg_idct_2x2' function
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
@@ -61,20 +64,17 @@
* 1.272758580 = 10426 * 2^-13
* 3.624509785 = 29692 * 2^-13
*
- * See jidctred.c for further details of the 2x2 reduced IDCT algorithm. Where
- * possible, the variable names and comments here in 'jsimd_idct_2x2_neon'
- * match up with those in 'jpeg_idct_2x2'.
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT which
- * requires fewer arithmetic operations and hence should be faster. The
- * primary purpose of this particular NEON optimized function is bit
- * exact compatibility with jpeg-6b.
+ * See jidctred.c for further details of the 2x2 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
*/
-void jsimd_idct_2x2_neon(void *dct_table,
- JCOEFPTR coef_block,
- JSAMPARRAY restrict output_buf,
- JDIMENSION output_col)
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+ -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
@@ -85,7 +85,7 @@ void jsimd_idct_2x2_neon(void *dct_table,
int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
- /* Load DCT quantization table. */
+ /* Load quantization table values. */
int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
@@ -99,20 +99,26 @@ void jsimd_idct_2x2_neon(void *dct_table,
row5 = vmulq_s16(row5, quant_row5);
row7 = vmulq_s16(row7, quant_row7);
- /* Pass 1: process input columns; put results in vectors row0 and row1. */
- /* Even part. */
+ /* Load IDCT conversion constants. */
+ const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+ /* Pass 1: process columns from input, put results in vectors row0 and
+ * row1.
+ */
+
+ /* Even part */
int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
- /* Odd part. */
- int32x4_t tmp0_l = vmull_n_s16(vget_low_s16(row1), F_3_624);
- tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row3), -F_1_272);
- tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row5), F_0_850);
- tmp0_l = vmlal_n_s16(tmp0_l, vget_low_s16(row7), -F_0_720);
- int32x4_t tmp0_h = vmull_n_s16(vget_high_s16(row1), F_3_624);
- tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row3), -F_1_272);
- tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row5), F_0_850);
- tmp0_h = vmlal_n_s16(tmp0_h, vget_high_s16(row7), -F_0_720);
+ /* Odd part */
+ int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+ tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+ int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+ tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
/* Final output stage: descale and narrow to 16-bit. */
row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
@@ -120,7 +126,7 @@ void jsimd_idct_2x2_neon(void *dct_table,
row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
- /* Transpose two rows ready for second pass. */
+ /* Transpose two rows, ready for second pass. */
int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
int16x8_t cols_0246 = cols_0246_1357.val[0];
int16x8_t cols_1357 = cols_0246_1357.val[1];
@@ -130,15 +136,18 @@ void jsimd_idct_2x2_neon(void *dct_table,
int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
- /* Pass 2: process 2 rows, store to output array. */
- /* Even part: only interested in col0; top half of tmp10 is "don't care". */
+ /* Pass 2: process two rows, store to output array. */
+
+ /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+ * care."
+ */
int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
- /* Odd part. Only interested in bottom half of tmp0. */
- int32x4_t tmp0 = vmull_n_s16(vget_low_s16(cols_1155), F_3_624);
- tmp0 = vmlal_n_s16(tmp0, vget_low_s16(cols_3377), -F_1_272);
- tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_1155), F_0_850);
- tmp0 = vmlal_n_s16(tmp0, vget_high_s16(cols_3377), -F_0_720);
+ /* Odd part: we're only interested in the bottom half of tmp0. */
+ int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+ tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+ tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
/* Final output stage: descale and clamp to range [0-255]. */
int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
@@ -156,10 +165,9 @@ void jsimd_idct_2x2_neon(void *dct_table,
}
-/*
- * 'jsimd_idct_4x4_neon' is an inverse-DCT function for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations and
- * produces exactly the same output as IJG's original 'jpeg_idct_4x4' function
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block. It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
* from jpeg-6b, which can be found in jidctred.c.
*
* Scaled integer constants are used to avoid floating-point arithmetic:
@@ -174,26 +182,19 @@ void jsimd_idct_2x2_neon(void *dct_table,
* 2.172734803 = 17799 * 2^-13
* 2.562915447 = 20995 * 2^-13
*
- * See jidctred.c for further details of the 4x4 reduced IDCT algorithm. Where
- * possible, the variable names and comments here in 'jsimd_idct_4x4_neon'
- * match up with those in 'jpeg_idct_4x4'.
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT which
- * requires fewer arithmetic operations and hence should be faster. The
- * primary purpose of this particular NEON optimized function is bit
- * exact compatibility with jpeg-6b.
+ * See jidctred.c for further details of the 4x4 IDCT algorithm. Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
*/
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
- F_1_847, -F_0_765, -F_0_211, F_1_451,
- -F_2_172, F_1_061, -F_0_509, -F_0_601,
- F_0_899, F_2_562, 0, 0
- };
-
-void jsimd_idct_4x4_neon(void *dct_table,
- JCOEFPTR coef_block,
- JSAMPARRAY restrict output_buf,
- JDIMENSION output_col)
+ F_1_847, -F_0_765, -F_0_211, F_1_451,
+ -F_2_172, F_1_061, -F_0_509, -F_0_601,
+ F_0_899, F_2_562, 0, 0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
{
ISLOW_MULT_TYPE *quantptr = dct_table;
@@ -207,7 +208,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
/* Load quantization table values for DC coefficients. */
- int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+ int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
/* Dequantize DC coefficients. */
row0 = vmulq_s16(row0, quant_row0);
@@ -222,22 +223,33 @@ void jsimd_idct_4x4_neon(void *dct_table,
int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
/* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+ /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+ const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+ const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+ const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+ const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
- /* All AC coefficients are zero. */
- /* Compute DC values and duplicate into row vectors 0, 1, 2 and 3. */
+ /* All AC coefficients are zero.
+ * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+ */
int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
row0 = dcval;
row1 = dcval;
row2 = dcval;
row3 = dcval;
} else if (left_ac_bitmap == 0) {
- /* AC coefficients are zero for columns 0, 1, 2 and 3. */
- /* Compute DC values for these columns. */
+ /* AC coefficients are zero for columns 0, 1, 2, and 3.
+ * Compute DC values for these columns.
+ */
int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
- /* Commence regular IDCT computation for columns 4, 5, 6 and 7. */
+ /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
@@ -246,7 +258,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
- /* Even part. */
+ /* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
@@ -258,7 +270,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
- /* Odd part. */
+ /* Odd part */
int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
z2 = vmul_s16(vget_high_s16(row5), quant_row5);
z3 = vmul_s16(vget_high_s16(row3), quant_row3);
@@ -284,11 +296,13 @@ void jsimd_idct_4x4_neon(void *dct_table,
row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
CONST_BITS - PASS1_BITS + 1));
} else if (right_ac_bitmap == 0) {
- /* AC coefficients are zero for columns 4, 5, 6 and 7. */
- /* Compute DC values for these columns. */
+ /* AC coefficients are zero for columns 4, 5, 6, and 7.
+ * Compute DC values for these columns.
+ */
int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
- /* Commence regular IDCT computation for columns 0, 1, 2 and 3. */
+ /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
/* Load quantization table. */
int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
@@ -297,7 +311,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
- /* Even part. */
+ /* Even part */
int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
@@ -309,7 +323,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
- /* Odd part. */
+ /* Odd part */
int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
z2 = vmul_s16(vget_low_s16(row5), quant_row5);
z3 = vmul_s16(vget_low_s16(row3), quant_row3);
@@ -343,7 +357,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
- /* Even part. */
+ /* Even part */
int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
@@ -360,7 +374,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
- /* Odd part. */
+ /* Odd part */
int16x8_t z1 = vmulq_s16(row7, quant_row7);
z2 = vmulq_s16(row5, quant_row5);
z3 = vmulq_s16(row3, quant_row3);
@@ -421,7 +435,8 @@ void jsimd_idct_4x4_neon(void *dct_table,
int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
/* Commence second pass of IDCT. */
- /* Even part. */
+
+ /* Even part */
int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
@@ -429,7 +444,7 @@ void jsimd_idct_4x4_neon(void *dct_table,
int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
- /* Odd part. */
+ /* Odd part */
tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
@@ -449,13 +464,15 @@ void jsimd_idct_4x4_neon(void *dct_table,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
CONST_BITS + PASS1_BITS + 3 + 1 - 16);
- /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements. */
- /* Interleaving store completes the transpose. */
+ /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+ * An interleaving store completes the transpose.
+ */
uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
vqmovun_s16(output_cols_13));
- uint16x4x2_t output_01_23 = { vreinterpret_u16_u8(output_0123.val[0]),
- vreinterpret_u16_u8(output_0123.val[1])
- };
+ uint16x4x2_t output_01_23 = { {
+ vreinterpret_u16_u8(output_0123.val[0]),
+ vreinterpret_u16_u8(output_0123.val[1])
+ } };
/* Store 4x4 block to memory. */
JSAMPROW outptr0 = output_buf[0] + output_col;
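As an aside on the scaled integer constants used above (CONST_BITS = 13): the following scalar sketch, which is illustrative rather than code from the patch, shows how a multiply by F_3_624 = 29692 followed by a rounding right shift approximates multiplication by 3.624509785, mirroring what vmull_lane_s16()/vrshrn_n_s32() do per lane.

#include <stdint.h>

#define CONST_BITS  13
#define F_3_624     29692  /* 3.624509785 * 2^13, rounded */

/* Approximate x * 3.624509785 in 13-bit fixed point, rounding to nearest;
 * the +(1 << (CONST_BITS - 1)) term is the rounding bias that
 * vrshrn_n_s32(..., CONST_BITS) applies per lane. */
static int16_t mul_3_624(int16_t x)
{
  int32_t product = (int32_t)x * F_3_624;
  return (int16_t)((product + (1 << (CONST_BITS - 1))) >> CONST_BITS);
}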
diff --git a/simd/arm/common/jquanti-neon.c b/simd/arm/jquanti-neon.c
index 6f8a3ab..a7eb6f1 100644
--- a/simd/arm/common/jquanti-neon.c
+++ b/simd/arm/jquanti-neon.c
@@ -1,7 +1,7 @@
/*
- * jquanti-neon.c - sample conversion and integer quantization (Arm NEON)
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
*
- * Copyright 2020 The Chromium Authors. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,28 +21,28 @@
*/
#define JPEG_INTERNALS
-#include "../../../jinclude.h"
-#include "../../../jpeglib.h"
-#include "../../../jsimd.h"
-#include "../../../jdct.h"
-#include "../../../jsimddct.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
#include <arm_neon.h>
-/*
- * Pixel channel sample values have range [0,255]. The Discrete Cosine
- * Transform (DCT) operates on values centered around 0.
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
*
* To prepare sample values for the DCT, load samples into a DCT workspace,
- * subtracting CENTREJSAMPLE (128). The samples, now in range [-128, 127],
+ * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
* are also widened from 8- to 16-bit.
*
- * The equivalent scalar C function 'convsamp' can be found in jcdctmgr.c.
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
*/
-void jsimd_convsamp_neon(JSAMPARRAY sample_data,
- JDIMENSION start_col,
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
@@ -54,22 +54,22 @@ void jsimd_convsamp_neon(JSAMPARRAY sample_data,
uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
- int16x8_t row0 = vreinterpretq_s16_u16(vsubl_u8(samp_row0,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row1 = vreinterpretq_s16_u16(vsubl_u8(samp_row1,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row2 = vreinterpretq_s16_u16(vsubl_u8(samp_row2,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row3 = vreinterpretq_s16_u16(vsubl_u8(samp_row3,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row4 = vreinterpretq_s16_u16(vsubl_u8(samp_row4,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row5 = vreinterpretq_s16_u16(vsubl_u8(samp_row5,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row6 = vreinterpretq_s16_u16(vsubl_u8(samp_row6,
- vdup_n_u8(CENTERJSAMPLE)));
- int16x8_t row7 = vreinterpretq_s16_u16(vsubl_u8(samp_row7,
- vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row0 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row1 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row2 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row3 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row4 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row5 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row6 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+ int16x8_t row7 =
+ vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
vst1q_s16(workspace + 0 * DCTSIZE, row0);
vst1q_s16(workspace + 1 * DCTSIZE, row1);
@@ -82,26 +82,25 @@ void jsimd_convsamp_neon(JSAMPARRAY sample_data,
}
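A scalar sketch of the sample conversion that jsimd_convsamp_neon() vectorizes above; the equivalent production code is convsamp() in jcdctmgr.c, and this standalone version is only illustrative.

#include <stdint.h>

#define DCTSIZE        8
#define CENTERJSAMPLE  128

static void convsamp_scalar(const uint8_t *sample_rows[DCTSIZE],
                            unsigned start_col, int16_t *workspace)
{
  int row, col;

  /* Recenter [0, 255] samples around 0 and widen from 8- to 16-bit. */
  for (row = 0; row < DCTSIZE; row++)
    for (col = 0; col < DCTSIZE; col++)
      workspace[row * DCTSIZE + col] =
        (int16_t)sample_rows[row][start_col + col] - CENTERJSAMPLE;
}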
-/*
- * After the DCT, the resulting coefficient values need to be divided by a
- * quantization value.
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
*
* To avoid a slow division operation, the DCT coefficients are multiplied by
- * the (scaled) reciprocal of the quantization values and then right-shifted.
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
*
- * The equivalent scalar C function 'quantize' can be found in jcdctmgr.c.
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
*/
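A per-coefficient scalar sketch of the reciprocal-multiply scheme described above. The layout of the divisors table (reciprocal, correction, and shift values) follows the pointers set up in jsimd_quantize_neon() below; the helper itself is an illustration, not the library's quantize() implementation.

#include <stdint.h>

/* Divide one DCT coefficient by its quantization value using the precomputed
 * (scaled) reciprocal, correction, and shift values, then restore the sign.
 * Mirrors the vector code below: add the correction to the absolute value,
 * multiply by the reciprocal, keep the upper 16 bits, then shift right. */
static int16_t quantize_one(int16_t coef, uint16_t recip, uint16_t corr,
                            uint16_t shift)
{
  uint16_t abs_coef = (uint16_t)(coef < 0 ? -coef : coef);
  uint16_t q = (uint16_t)((((uint32_t)(abs_coef + corr) * recip) >> 16) >>
                          shift);
  return (int16_t)(coef < 0 ? -q : q);
}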
-void jsimd_quantize_neon(JCOEFPTR coef_block,
- DCTELEM *divisors,
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
DCTELEM *workspace)
{
JCOEFPTR out_ptr = coef_block;
UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+ int i;
- for (int i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+ for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
@@ -137,7 +136,7 @@ void jsimd_quantize_neon(JCOEFPTR coef_block,
abs_row2 = vaddq_u16(abs_row2, corr2);
abs_row3 = vaddq_u16(abs_row3, corr3);
- /* Multiply DCT coefficients by quantization reciprocal. */
+ /* Multiply DCT coefficients by quantization reciprocals. */
int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
vget_low_u16(recip0)));
int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
@@ -160,8 +159,9 @@ void jsimd_quantize_neon(JCOEFPTR coef_block,
row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
- /* Since VSHR only supports an immediate as its second argument, negate */
- /* the shift value and shift left. */
+ /* Since VSHR only supports an immediate as its second argument, negate the
+ * shift value and shift left.
+ */
row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
vnegq_s16(shift0)));
row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
diff --git a/simd/arm/neon-compat.h b/simd/arm/neon-compat.h
new file mode 100644
index 0000000..3ce3bcb
--- /dev/null
+++ b/simd/arm/neon-compat.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__clang__) || defined(_MSC_VER)
+#define HAVE_VLD1_S16_X3
+#define HAVE_VLD1_U16_X2
+#define HAVE_VLD1Q_U8_X4
+#endif
+
+/* Define compiler-independent count-leading-zeros macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZL(x) _CountLeadingZeros64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZL(x) __builtin_clzl(x)
+#else
+#error "Unknown compiler"
+#endif
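For illustration, one typical use of these macros is computing the number of magnitude bits ("nbits") of a coefficient, mirroring the 32 - __builtin_clz(temp) idiom that appears in the x86 assembly later in this patch. The helper below is an assumed example, not code from the patch.

#include "neon-compat.h"

/* Number of bits needed to represent x (0 for x == 0).  BUILTIN_CLZ(0) is
 * undefined for the GCC/Clang builtins, so guard the zero case explicitly. */
static int nbits_of(unsigned int x)
{
  return x ? 32 - (int)BUILTIN_CLZ(x) : 0;
}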
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
new file mode 100644
index 0000000..e2347b9
--- /dev/null
+++ b/simd/arm/neon-compat.h.in
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
+#define BUILTIN_CLZL(x) _CountLeadingZeros64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x) __builtin_clz(x)
+#define BUILTIN_CLZL(x) __builtin_clzl(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/simd/gas-preprocessor.in b/simd/gas-preprocessor.in
deleted file mode 100755
index 560f788..0000000
--- a/simd/gas-preprocessor.in
+++ /dev/null
@@ -1 +0,0 @@
-gas-preprocessor.pl @CMAKE_ASM_COMPILER@ ${1+"$@"}
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index d0112e6..278cf5e 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -1,8 +1,9 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
;
-; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,133 +16,255 @@
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
%include "jsimdext.inc"
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
; --------------------------------------------------------------------------
SECTION SEG_CONST
- alignz 32
GLOBAL_DATA(jconst_huff_encode_one_block)
- EXTERN EXTN(jpeg_nbits_table)
EXTN(jconst_huff_encode_one_block):
alignz 32
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+ dq 0x000f, 0x001f, 0x003f, 0x007f
+ dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+ alignz 32
+
+%ifdef PIC
+%define NBITS(x) nbits_base + x
+%else
+%define NBITS(x) jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov edx, put_buffer
- mov ecx, put_bits
- shr edx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [eax], dl ; *buffer++ = c;
- add eax, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [eax], 0 ; *buffer++ = 0;
- add eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
+%define mm_put_buffer mm0
+%define mm_all_0xff mm1
+%define mm_temp mm2
+%define mm_nbits mm3
+%define mm_code_bits mm3
+%define mm_code mm4
+%define mm_overflow_bits mm5
+%define mm_save_nbits mm6
+
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values, or
+; mmN treated as eight unsigned 8-bit values
+; bN[i]: perform the same operation on all unsigned 8-bit values,
+; i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
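To make the shorthand concrete, here is an illustrative C intrinsics rendering of one pattern that recurs below, e.g. "pcmpgtw xmm4, xmm0  ;A: w4[i] = (w0[i] < 0 ? -1 : 0);" with xmm4 previously zeroed:

#include <emmintrin.h>

/* Per-lane "is negative?" mask over eight signed 16-bit values: each lane of
 * the result is -1 (0xFFFF) where the corresponding lane of w0 is < 0, else 0.
 * This is what pcmpgtw computes when its destination starts out as zero. */
static __m128i lanes_negative_mask(__m128i w0)
{
  return _mm_cmpgt_epi16(_mm_setzero_si128(), w0);
}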
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp %1
+%define %%tempb %2
+%define %%temph %3
+ add nbits, free_bits ; nbits += free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ movq mm_temp, mm_code ; temp = code;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
+ neg free_bits ; free_bits = -free_bits;
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
+ add free_bits, 64 ; free_bits += 64;
+ por mm_temp, mm_put_buffer ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+ movd mm_save_nbits, nbits_base ; save nbits_base
+%endif
+ movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
+ movq mm_put_buffer, mm_code ; put_buffer = code;
+ pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+ movq mm_code, mm_code_bits ; code = code_bits;
+ psrlq mm_code_bits, 32 ; code_bits >>= 32;
+ pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
+ movd %%temp, mm_code_bits ; temp = code_bits;
+ bswap %%temp ; temp = htonl(temp);
+ test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
+ jnz %%.SLOW ; goto %%.SLOW
+ mov dword [buffer], %%temp ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ %4
+ movd nbits, mm_code ; nbits = (uint32_t)(code);
+ %5
+ bswap nbits ; nbits = htonl(nbits);
+ mov dword [buffer + 4], nbits ; *(uint32_t)(buffer + 4) = nbits;
+ lea buffer, [buffer + 8] ; buffer += 8;
+ %6
+ %7
+ %8
+ jmp %9 ; return
+%%.SLOW:
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr %%temp, 16 ; temp >>= 16;
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+ movd nbits_base, mm_save_nbits ; restore nbits_base
+%endif
+ bswap nbits ; nbits = htonl(nbits)
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ shr nbits, 16 ; nbits >>= 16;
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
+ %4
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+ %5
+ %6
+ %7
+ %8
+ jmp %9 ; return;
%endmacro
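A small C sketch of the byte-stuffing rule that the slow path above implements with the cmp/sbb sequence (emit the byte, and follow any 0xFF with a stuffed 0x00); the helper name is illustrative.

#include <stdint.h>

static uint8_t *emit_byte_stuffed(uint8_t *buffer, uint8_t b)
{
  buffer[0] = b;
  buffer[1] = 0;                        /* speculative stuffed zero byte */
  /* Keep the zero byte only after 0xFF; otherwise the next write overwrites
   * it.  Equivalent to the branchless "cmp ...; sbb buffer, -2" above. */
  return buffer + (b == 0xFF ? 2 : 1);
}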
-%macro CHECKBUF15 0
- cmp put_bits, 16 ; if (put_bits > 31) {
- jl %%.CHECKBUF15_END
- mov eax, POINTER [esp+buffer]
- EMIT_BYTE
- EMIT_BYTE
- mov POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
+%macro PUSH 1
+ push %1
+%assign stack_offset stack_offset + 4
%endmacro
-%macro EMIT_BITS 1
- PUT_BITS %1
- CHECKBUF15
+%macro POP 1
+ pop %1
+%assign stack_offset stack_offset - 4
%endmacro
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm4, xmm4 ; __m128i neg = _mm_setzero_si128();
- pxor xmm5, xmm5 ; __m128i neg = _mm_setzero_si128();
- pxor xmm6, xmm6 ; __m128i neg = _mm_setzero_si128();
- pxor xmm7, xmm7 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [esi + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [esi + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [esi + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [esi + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [esi + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [esi + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [esi + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [esi + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register. Equivalent to
+; get_GOT %1
+; lea %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+ call %%.geteip
+%%.ref:
+ %4
+ add %1, %2 - %%.ref
+ jmp short %%.done
+ align 32
+%%.geteip:
+ %3 4 ; must adjust stack pointer because of call
+ mov %1, POINTER [esp]
+ ret
+ align 32
+%%.done:
%else
- pinsrw %37, ecx, 7 ; xmm_shadow[31] = block[jno31];
+ %3 0
+ %4
%endif
- pcmpgtw xmm4, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm5, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm6, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm7, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm4 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm5 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm6 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm7 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm4 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm5 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm6 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm7 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm4, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm5, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm6, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm7, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
;
@@ -152,272 +275,487 @@ EXTN(jconst_huff_encode_one_block):
; JCOEFPTR block, int last_dc_val,
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad 6 * SIZEOF_DWORD ; Align to 16 bytes
-%define t1 pad
-%define t2 t1 + (DCTSIZE2 * SIZEOF_WORD)
-%define block t2 + (DCTSIZE2 * SIZEOF_WORD)
-%define actbl block + SIZEOF_DWORD
-%define buffer actbl + SIZEOF_DWORD
-%define temp buffer + SIZEOF_DWORD
-%define temp2 temp + SIZEOF_DWORD
-%define temp3 temp2 + SIZEOF_DWORD
-%define temp4 temp3 + SIZEOF_DWORD
-%define temp5 temp4 + SIZEOF_DWORD
-%define gotptr temp5 + SIZEOF_DWORD ; void *gotptr
-%define put_buffer ebx
-%define put_bits edi
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame eax
+%ifdef PIC
+%define nbits_base ebx
+%endif
+%define emit_temp ebx
+%define emit_tempb bl
+%define emit_temph bh
+%define dctbl ecx
+%define block edx
+%define code_temp esi
+%define index_temp edi
+%define t esp
+%define index ebp
+
+%assign save_frame DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
align 32
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
- push ebp
- mov eax, esp ; eax = original ebp
- sub esp, byte 4
- and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [esp], eax
- mov ebp, esp ; ebp = aligned ebp
- sub esp, temp5+9*SIZEOF_DWORD-pad
- push ebx
- push ecx
-; push edx ; need not be preserved
- push esi
- push edi
- push ebp
-
- mov esi, POINTER [eax+8] ; (working_state *state)
- mov put_buffer, dword [esi+8] ; put_buffer = state->cur.put_buffer;
- mov put_bits, dword [esi+12] ; put_bits = state->cur.put_bits;
- push esi ; esi is now scratch
-
- get_GOT edx ; get GOT address
- movpic POINTER [esp+gotptr], edx ; save GOT address
-
- mov ecx, POINTER [eax+28]
- mov edx, POINTER [eax+16]
- mov esi, POINTER [eax+12]
- mov POINTER [esp+actbl], ecx
- mov POINTER [esp+block], edx
- mov POINTER [esp+buffer], esi
-
- ; Encode the DC coefficient difference per section F.1.2.1
- mov esi, POINTER [esp+block] ; block
- movsx ecx, word [esi] ; temp = temp2 = block[0] - last_dc_val;
- sub ecx, dword [eax+20]
- mov esi, ecx
-
- ; This is a well-known technique for obtaining the absolute value
- ; with out a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov edx, ecx
- sar edx, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor ecx, edx ; temp ^= temp3;
- sub ecx, edx ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add esi, edx ; temp2 += temp3;
- mov dword [esp+temp], esi ; backup temp2 in temp
-
- ; Find the number of bits needed for the magnitude of the coefficient
- movpic ebp, POINTER [esp+gotptr] ; load GOT address (ebp)
- movzx edx, byte [GOTOFF(ebp, EXTN(jpeg_nbits_table) + ecx)] ; nbits = JPEG_NBITS(temp);
- mov dword [esp+temp2], edx ; backup nbits in temp2
-
- ; Emit the Huffman-coded symbol for the number of bits
- mov ebp, POINTER [eax+24] ; After this point, arguments are not accessible anymore
- mov eax, INT [ebp + edx * 4] ; code = dctbl->ehufco[nbits];
- movzx ecx, byte [ebp + edx + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS eax ; EMIT_BITS(code, size)
-
- mov ecx, dword [esp+temp2] ; restore nbits
-
- ; Mask off any extra bits in code
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, dword [esp+temp] ; temp2 &= (((JLONG)1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS eax ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ecx, ecx
- mov esi, POINTER [esp+block]
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm0, xmm1, xmm2, xmm3
-
- pxor xmm7, xmm7
- movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea esi, [esp+t1]
- mov ebp, POINTER [esp+actbl] ; ebp = actbl
-
-.BLOOP:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz near .ELOOP
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov dword [esp+temp3], edx
-.BRLOOP:
- cmp ecx, 16 ; while (r > 15) {
- jl near .ERLOOP
- sub ecx, 16 ; r -= 16;
- mov dword [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, dword [esp+temp]
- jmp .BRLOOP
-.ERLOOP:
- movsx eax, word [esi] ; temp = t1[k];
- movpic edx, POINTER [esp+gotptr] ; load GOT address (edx)
- movzx eax, byte [GOTOFF(edx, EXTN(jpeg_nbits_table) + eax)] ; nbits = JPEG_NBITS(temp);
- mov dword [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, dword [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, dword [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP
-.ELOOP:
- movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD] ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
- movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD] ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
- movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD] ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
- movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD] ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
- pcmpeqw xmm0, xmm7 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm7 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm7 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm7 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- pmovmskb edx, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb ecx, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- shl ecx, 16
- or edx, ecx
- not edx ; index = ~index;
-
- lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
- sub eax, esi
- shr eax, 1
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz near .ELOOP2
- shr edx, cl ; index >>= r;
- add ecx, eax
- lea esi, [esi+ecx*2] ; k += r;
- mov dword [esp+temp3], edx
- jmp .BRLOOP2
-.BLOOP2:
- bsf ecx, edx ; r = __builtin_ctzl(index);
- jz near .ELOOP2
- lea esi, [esi+ecx*2] ; k += r;
- shr edx, cl ; index >>= r;
- mov dword [esp+temp3], edx
-.BRLOOP2:
- cmp ecx, 16 ; while (r > 15) {
- jl near .ERLOOP2
- sub ecx, 16 ; r -= 16;
- mov dword [esp+temp], ecx
- mov eax, INT [ebp + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx ecx, byte [ebp + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- EMIT_BITS eax ; EMIT_BITS(code_0xf0, size_0xf0)
- mov ecx, dword [esp+temp]
- jmp .BRLOOP2
-.ERLOOP2:
- movsx eax, word [esi] ; temp = t1[k];
- bsr eax, eax ; nbits = 32 - __builtin_clz(temp);
- inc eax
- mov dword [esp+temp2], eax
- ; Emit Huffman symbol for run length / number of bits
- shl ecx, 4 ; temp3 = (r << 4) + nbits;
- add ecx, eax
- mov eax, INT [ebp + ecx * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [ebp + ecx + 1024] ; size = actbl->ehufsi[temp3];
- EMIT_BITS eax
-
- movsx edx, word [esi+DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov ecx, dword [esp+temp2]
- mov eax, 1
- shl eax, cl
- dec eax
- and eax, edx ; temp2 &= (((JLONG)1)<<nbits) - 1;
- EMIT_BITS eax ; PUT_BITS(temp2, nbits)
- mov edx, dword [esp+temp3]
- add esi, 2 ; ++k;
- shr edx, 1 ; index >>= 1;
-
- jmp .BLOOP2
-.ELOOP2:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea edx, [esp + t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp edx, esi ; if (r > 0) {
- je .EFN
- mov eax, INT [ebp] ; code = actbl->ehufco[0];
- movzx ecx, byte [ebp + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS eax
-.EFN:
- mov eax, [esp+buffer]
- pop esi
- ; Save put_buffer & put_bits
- mov dword [esi+8], put_buffer ; state->cur.put_buffer = put_buffer;
- mov dword [esi+12], put_bits ; state->cur.put_bits = put_bits;
-
- pop ebp
- pop edi
- pop esi
-; pop edx ; need not be preserved
- pop ecx
- pop ebx
- mov esp, ebp ; esp <- aligned ebp
- pop esp ; esp <- original ebp
- pop ebp
+
+%assign stack_offset 0
+%define arg_state 4 + stack_offset
+%define arg_buffer 8 + stack_offset
+%define arg_block 12 + stack_offset
+%define arg_last_dc_val 16 + stack_offset
+%define arg_dctbl 20 + stack_offset
+%define arg_actbl 24 + stack_offset
+
+ ;X: X = code stream
+ mov block, [esp + arg_block]
+ PUSH ebx
+ PUSH ebp
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ PUSH esi
+ PUSH edi
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ mov frame, esp
+ lea t, [frame - (save_frame + 4)]
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
+ mov [t + save_frame], frame
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movsx code_temp, word [block] ;Z: code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+ movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+ ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+ sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
+
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
+ pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl index_temp, 16 ;Z: index_temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ or index, index_temp ;Z: index |= index_temp;
+%undef index_temp
+%define free_bits edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ not index ;Z: index = ~index;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ mov dctbl, [frame + arg_dctbl]
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
+%endmacro
+
+ GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
+ ;Z: i.e. if code_temp is positive
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ movd mm_temp, code_temp ;Z: temp = code_temp
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1]
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define nbits edx
+%define nbitsb dl
+%define nbitsh dh
+ movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state esi
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ mov state, [frame + arg_state]
+ movd mm_nbits, nbits ;Z: nbits --> MMX register
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+ ;Z: code = dctbl->ehufco[nbits];
+%define size ecx
+%define sizeb cl
+%define sizeh ch
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+ ;Z: size = dctbl->ehufsi[nbits];
+%undef dctbl
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ mov free_bits, [state + working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+%undef state
+%define actbl esi
+ mov actbl, [frame + arg_actbl]
+%define buffer eax
+ mov buffer, [frame + arg_buffer]
+%undef frame
+ jmp .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+; size <= 32, so this is not really a loop
+.BRLOOP1: ; .BRLOOP1:
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ and index, 0x7ffffff ; clear index if size == 32
+ sub size, 16 ; size -= 16;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ jmp .ERLOOP1 ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+%ifdef PIC
+ times 6 nop
+%else
+ times 2 nop
+%endif
+.BLOOP1: ; do { /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged. Correct in .BRLOOP.
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP1 ; goto .BRLOOP1;
+.ERLOOP1: ; .ERLOOP1:
+ movsx nbits, word [t] ; nbits = *t;
+%ifdef PIC
+ add size, size ; size += size;
+%else
+ lea size, [size * 2] ; size += size;
+%endif
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_nbits, nbits ; nbits --> MMX register
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+.BEGIN: ; .BEGIN:
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ add nbits, size ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP1 ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1: ; .ELOOP1:
+ pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
+ pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
+ shl size, 16 ; size <<= 16;
+ or index, size ; index |= size;
+ not index ; index = ~index;
+ lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+ ; nbits = t + 1 + 64;
+ and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
+ sub nbits, t ; nbits -= t;
+ shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ inc size ; ++size;
+ test index, index ; if (index == 0)
+ jz .ELOOP2 ; goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+ shr index, sizeb ; index >>= size;
+ lea size, [size + nbits - 33] ; size = size + nbits - 33;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+.BRLOOP2: ; do {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ sub size, 16 ; size -= 16;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
+ movd mm_nbits, nbits ; else { nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ cmp size, 16 ; if (size <= 16)
+ jle .ERLOOP2 ; goto .ERLOOP2;
+ jmp .BRLOOP2 ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BLOOP2: ; do { /* size = # of zero bits/elements to skip */
+ shr index, sizeb ; index >>= size;
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
+ cmp size, 16 ; if (size > 16)
+ jg .BRLOOP2 ; goto .BRLOOP2;
+.ERLOOP2: ; .ERLOOP2:
+ movsx nbits, word [t] ; nbits = *t;
+ add size, size ; size += size;
+ movd mm_temp, nbits ; temp = nbits;
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
+ movd mm_nbits, nbits ; nbits --> MMX register
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits;
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+ ; code = actbl->ehufco[size-16];
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+ ; size = actbl->ehufsi[size-16];
+ psllq mm_code, mm_nbits ; code <<= nbits;
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
+ lea nbits, [nbits + size] ; nbits += size;
+ por mm_code, mm_temp ; code |= temp;
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
+ tzcnt size, index ; size = # of trailing 0 bits in index
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ inc size ; ++size;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP2 ; } while (index != 0);
+.ELOOP2: ; .ELOOP2:
+ mov nbits, t ; nbits = t;
+ lea t, [t + SIZEOF_WORD] ; t = &t[1];
+ and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
+ cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
+ je .EFN ; {
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+ ; code = actbl->ehufco[0];
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ movd mm_nbits, nbits ; nbits --> MMX register
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
+ por mm_put_buffer, mm_code ; put_buffer |= code;
+.EFN: ; } }
+%define frame esp
+ mov frame, [t + save_frame]
+%define state ecx
+ mov state, [frame + arg_state]
+ movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ emms
+ mov [state + working_state.cur.free_bits], free_bits
+ ; state->cur.free_bits = free_bits;
+ POP edi
+ POP esi
+ POP ebp
+ POP ebx
ret
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP1:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
+ .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP1:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP1 }, \
+ .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP2:
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
+ { cmp size, 16 }, \
+ { jle .ERLOOP2 }, \
+ .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_ERLOOP2:
+ EMIT_QWORD size, sizeb, sizeh, \
+ { xor size, size }, \
+ { tzcnt size, index }, \
+ { inc size }, \
+ { test index, index }, \
+ { jnz .BLOOP2 }, \
+ .ELOOP2
+
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
index 97de230..23cf733 100644
--- a/simd/i386/jfdctint-avx2.asm
+++ b/simd/i386/jfdctint-avx2.asm
@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
@@ -103,7 +103,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endmacro
; --------------------------------------------------------------------------
-; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
; %1-%4: Input/output registers
; %5-%8: Temp registers
; %9: Pass (1 or 2)
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
index 3ade9d4..34a43b9 100644
--- a/simd/i386/jfdctint-mmx.asm
+++ b/simd/i386/jfdctint-mmx.asm
@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
index 71b684c..6f8e18c 100644
--- a/simd/i386/jfdctint-sse2.asm
+++ b/simd/i386/jfdctint-sse2.asm
@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
index c371985..199c7df 100644
--- a/simd/i386/jidctint-avx2.asm
+++ b/simd/i386/jidctint-avx2.asm
@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
@@ -113,7 +113,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endmacro
; --------------------------------------------------------------------------
-; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
; %1-%4: Input/output registers
; %5-%12: Temp registers
; %9: Pass (1 or 2)
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
index 4f07f56..f15c8d3 100644
--- a/simd/i386/jidctint-mmx.asm
+++ b/simd/i386/jidctint-mmx.asm
@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
index e442fdd..43e3201 100644
--- a/simd/i386/jidctint-sse2.asm
+++ b/simd/i386/jidctint-sse2.asm
@@ -2,7 +2,7 @@
; jidctint.asm - accurate integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2016, D. R. Commander.
+; Copyright (C) 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
index 2f92f8c..563949a 100644
--- a/simd/i386/jsimd.c
+++ b/simd/i386/jsimd.c
@@ -543,12 +543,6 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0;
}
-GLOBAL(int)
-jsimd_can_h1v2_fancy_upsample(void)
-{
- return 0;
-}
-
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -585,12 +579,6 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr);
}
-GLOBAL(void)
-jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 99c8801..64747c6 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,11 +2,12 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
* Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -121,6 +122,17 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_neon
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
+#endif
+
EXTERN(void) jsimd_rgb_ycc_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
@@ -300,6 +312,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+ (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+ JDIMENSION output_row, int num_rows);
+
EXTERN(void) jsimd_rgb_gray_convert_altivec
(JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
JDIMENSION output_row, int num_rows);
@@ -416,6 +450,17 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
+#ifndef NEON_INTRINSICS
+
+EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
+ (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows);
+
+#endif
+
EXTERN(void) jsimd_ycc_rgb_convert_dspr2
(JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
JSAMPARRAY output_buf, int num_rows);
@@ -637,6 +682,9 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+ (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr);
EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
(int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
JSAMPARRAY *output_data_ptr);
@@ -871,6 +919,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+ (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+ JSAMPARRAY output_buf);
+
EXTERN(void) jsimd_h2v1_merged_upsample_altivec
(JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
JSAMPARRAY output_buf);
@@ -947,7 +1039,7 @@ EXTERN(void) jsimd_convsamp_float_sse2
EXTERN(void) jsimd_convsamp_float_dspr2
(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
-/* Slow Integer Forward DCT */
+/* Accurate Integer Forward DCT */
EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
extern const int jconst_fdct_islow_sse2[];
@@ -974,6 +1066,8 @@ EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
/* Floating Point Forward DCT */
@@ -1054,7 +1148,7 @@ EXTERN(void) jsimd_idct_12x12_pass1_dspr2
EXTERN(void) jsimd_idct_12x12_pass2_dspr2
(int *workspace, int *output);
-/* Slow Integer Inverse DCT */
+/* Accurate Integer Inverse DCT */
EXTERN(void) jsimd_idct_islow_mmx
(void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col);
@@ -1105,6 +1199,10 @@ EXTERN(void) jsimd_idct_ifast_rows_dspr2
(DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_mmi
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
EXTERN(void) jsimd_idct_ifast_altivec
(void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col);
@@ -1134,15 +1232,27 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl, c_derived_tbl *actbl);
+#ifndef NEON_INTRINSICS
+
EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl, c_derived_tbl *actbl);
+#endif
+
/* Progressive Huffman encoding */
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *values, size_t *zerobits);
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
diff --git a/simd/nasm/jsimdcfg.inc.h b/simd/nasm/jsimdcfg.inc.h
index 7ff7e29..bf2a45a 100644
--- a/simd/nasm/jsimdcfg.inc.h
+++ b/simd/nasm/jsimdcfg.inc.h
@@ -1,8 +1,10 @@
-// This file generates the include file for the assembly
-// implementations by abusing the C preprocessor.
-//
-// Note: Some things are manually defined as they need to
-// be mapped to NASM types.
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
;
; Automatically generated include file from jsimdcfg.inc.h
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index 9930d80..e8d50b0 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -2,8 +2,9 @@
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, 2019, D. R. Commander.
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
;
@@ -130,13 +131,53 @@ section .note.GNU-stack noalloc noexec nowrite progbits
; Common types
;
%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
%define POINTER qword ; general pointer type
%define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
%define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
-%else
+%define resp resq
+%define dp dq
+%define raxp rax
+%define rbxp rbx
+%define rcxp rcx
+%define rdxp rdx
+%define rsip rsi
+%define rdip rdi
+%define rbpp rbp
+%define rspp rsp
+%define r8p r8
+%define r9p r9
+%define r10p r10
+%define r11p r11
+%define r12p r12
+%define r13p r13
+%define r14p r14
+%define r15p r15
+%endif
+%endif
+%ifndef raxp
%define POINTER dword ; general pointer type
%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
+%define resp resd
+%define dp dd
+; x86_64 ILP32 ABI (x32)
+%define raxp eax
+%define rbxp ebx
+%define rcxp ecx
+%define rdxp edx
+%define rsip esi
+%define rdip edi
+%define rbpp ebp
+%define rspp esp
+%define r8p r8d
+%define r9p r9d
+%define r10p r10d
+%define r11p r11d
+%define r12p r12d
+%define r13p r13d
+%define r14p r14d
+%define r15p r15d
%endif
%define INT dword ; signed integer type
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 10d2834..ffb527d 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -3,6 +3,7 @@
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
mov rsi, r12
mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -77,10 +78,10 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
- mov rbx, JSAMPROW [rbx] ; outptr1
- mov rdx, JSAMPROW [rdx] ; outptr2
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
cmp rcx, byte SIZEOF_YMMWORD
jae near .columnloop
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index 2c914d3..af70ed6 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -2,6 +2,7 @@
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,9 +57,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
mov rsi, r12
mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -76,10 +77,10 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
- mov rbx, JSAMPROW [rbx] ; outptr1
- mov rdx, JSAMPROW [rdx] ; outptr2
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
+ mov rbxp, JSAMPROW [rbx] ; outptr1
+ mov rdxp, JSAMPROW [rdx] ; outptr2
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 175b60d..ddcc2c0 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -3,6 +3,7 @@
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,7 +58,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
mov rsi, r12
mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
pop rcx
@@ -71,8 +72,8 @@ EXTN(jsimd_rgb_gray_convert_avx2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
cmp rcx, byte SIZEOF_YMMWORD
jae near .columnloop
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 873be80..f1d399a 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -2,6 +2,7 @@
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +57,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
mov rsi, r12
mov ecx, r13d
- mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+ mov rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
lea rdi, [rdi+rcx*SIZEOF_JSAMPROW]
pop rcx
@@ -70,8 +71,8 @@ EXTN(jsimd_rgb_gray_convert_sse2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr0
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr0
cmp rcx, byte SIZEOF_XMMWORD
jae near .columnloop
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 7deab58..0072028 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -1,8 +1,9 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,146 +16,164 @@
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
%include "jsimdext.inc"
+struc working_state
+.next_output_byte: resp 1 ; => next byte to write in buffer
+.free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1 ; current bit accumulation buffer
+.cur.free_bits resd 1 ; # of bits available in it
+.cur.last_dc_val resd 4 ; last DC coef for each component
+.cinfo: resp 1 ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco: resd 256 ; code for each symbol
+.ehufsi: resb 256 ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
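The two strucs above only describe the memory layout the assembly expects; as a hedged C sketch (type names with a _sketch suffix are illustrative, not from the patch — the canonical definitions live in jchuff.c and jchuff.h), the offsets correspond to roughly:

    #include <stddef.h>

    /* Hedged sketch of the layout assumed by the strucs above.
       resp = pointer-sized, resq = 8 bytes, resd = 4 bytes, resb = 1 byte. */
    typedef struct {
      unsigned int ehufco[256];        /* code for each symbol */
      char ehufsi[256];                /* length of code for each symbol */
    } c_derived_tbl_sketch;

    typedef struct {
      unsigned char *next_output_byte; /* => next byte to write in buffer */
      size_t free_in_buffer;           /* # of byte spaces remaining in buffer */
      struct {
        unsigned long long put_buffer; /* current bit accumulation buffer */
        int free_bits;                 /* # of bits available in it */
        int last_dc_val[4];            /* last DC coef for each component */
      } cur;
      void *cinfo;                     /* dump_buffer needs access to this */
    } working_state_sketch;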
; --------------------------------------------------------------------------
SECTION SEG_CONST
alignz 32
GLOBAL_DATA(jconst_huff_encode_one_block)
- EXTERN EXTN(jpeg_nbits_table)
EXTN(jconst_huff_encode_one_block):
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+ dd 0x000f, 0x001f, 0x003f, 0x007f
+ dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+ dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
alignz 32
-; --------------------------------------------------------------------------
- SECTION SEG_TEXT
- BITS 64
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 << 9 db 10
+times 1 << 8 db 9
+times 1 << 7 db 8
+times 1 << 6 db 7
+times 1 << 5 db 6
+times 1 << 4 db 5
+times 1 << 3 db 4
+times 1 << 2 db 3
+times 1 << 1 db 2
+times 1 << 0 db 1
+times 1 db 0
+jpeg_nbits_table:
+times 1 db 0
+times 1 << 0 db 1
+times 1 << 1 db 2
+times 1 << 2 db 3
+times 1 << 3 db 4
+times 1 << 4 db 5
+times 1 << 5 db 6
+times 1 << 6 db 7
+times 1 << 7 db 8
+times 1 << 8 db 9
+times 1 << 9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code. In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it. This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
- sub put_bits, 8 ; put_bits -= 8;
- mov rdx, put_buffer
- mov ecx, put_bits
- shr rdx, cl ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
- mov byte [buffer], dl ; *buffer++ = c;
- add buffer, 1
- cmp dl, 0xFF ; need to stuff a zero byte?
- jne %%.EMIT_BYTE_END
- mov byte [buffer], 0 ; *buffer++ = 0;
- add buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
+ alignz 32
-%macro PUT_BITS 1
- add put_bits, ecx ; put_bits += size;
- shl put_buffer, cl ; put_buffer = (put_buffer << size);
- or put_buffer, %1
-%endmacro
+%define NBITS(x) nbits_base + x
+%define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
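The byte table above is mirrored around the jpeg_nbits_table label so that one signed displacement from nbits_base serves both positive coefficients (stored as-is) and negative coefficients (stored pre-decremented by 1); MASK_BITS() reuses the same base register to reach jpeg_mask_bits. A hedged C model of what the NBITS()/MASK_BITS() loads compute (function name is illustrative, not from the patch):

    /* Hedged C model of the NBITS() lookup: the table entry at the (possibly
       negative) stored offset is the bit length of |coef|. */
    static int jpeg_nbits_sketch(int coef)
    {
      int x = (coef < 0) ? coef - 1 : coef;  /* what the assembly stores/looks up */
      int v = (x < 0) ? ~x : x;              /* == |coef| */
      int nbits = 0;
      while (v) { nbits++; v >>= 1; }        /* what the single byte load returns */
      return nbits;
    }

    /* MASK_BITS(n) resolves to the dword jpeg_mask_bits[n] == (1 << n) - 1,
       addressed relative to the same nbits_base register. */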
-%macro CHECKBUF31 0
- cmp put_bits, 32 ; if (put_bits > 31) {
- jl %%.CHECKBUF31_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
- cmp put_bits, 48 ; if (put_bits > 47) {
- jl %%.CHECKBUF47_END
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
- EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
+; --------------------------------------------------------------------------
+ SECTION SEG_TEXT
+ BITS 64
-%macro EMIT_BITS 2
- CHECKBUF47
- mov ecx, %2
- PUT_BITS %1
-%endmacro
+; Shorthand used to describe SIMD operations:
+; wN: xmmN treated as eight signed 16-bit values
+; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
+; bN: xmmN treated as 16 unsigned 8-bit values
+; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
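As a hedged intrinsics rendering of this shorthand, the recurring pair annotated as "w4[i] = (w0[i] < 0 ? -1 : 0); w0[i] += w4[i];" (pcmpgtw followed by paddw, which pre-decrements negative coefficients before they are stored to t[]) corresponds to (function name is illustrative, not from the patch):

    #include <emmintrin.h>

    /* Hedged sketch of the sign-adjust idiom used throughout this file. */
    static __m128i sign_adjust_sketch(__m128i w0)
    {
      __m128i w4 = _mm_cmpgt_epi16(_mm_setzero_si128(), w0); /* -1 where w0[i] < 0 */
      return _mm_add_epi16(w0, w4);                          /* w0[i] + (w0[i] < 0 ? -1 : 0) */
    }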
-%macro kloop_prepare 37 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
- pxor xmm8, xmm8 ; __m128i neg = _mm_setzero_si128();
- pxor xmm9, xmm9 ; __m128i neg = _mm_setzero_si128();
- pxor xmm10, xmm10 ; __m128i neg = _mm_setzero_si128();
- pxor xmm11, xmm11 ; __m128i neg = _mm_setzero_si128();
- pinsrw %34, word [r12 + %2 * SIZEOF_WORD], 0 ; xmm_shadow[0] = block[jno0];
- pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0 ; xmm_shadow[8] = block[jno8];
- pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0 ; xmm_shadow[16] = block[jno16];
- pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0 ; xmm_shadow[24] = block[jno24];
- pinsrw %34, word [r12 + %3 * SIZEOF_WORD], 1 ; xmm_shadow[1] = block[jno1];
- pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1 ; xmm_shadow[9] = block[jno9];
- pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1 ; xmm_shadow[17] = block[jno17];
- pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1 ; xmm_shadow[25] = block[jno25];
- pinsrw %34, word [r12 + %4 * SIZEOF_WORD], 2 ; xmm_shadow[2] = block[jno2];
- pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2 ; xmm_shadow[10] = block[jno10];
- pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2 ; xmm_shadow[18] = block[jno18];
- pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2 ; xmm_shadow[26] = block[jno26];
- pinsrw %34, word [r12 + %5 * SIZEOF_WORD], 3 ; xmm_shadow[3] = block[jno3];
- pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3 ; xmm_shadow[11] = block[jno11];
- pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3 ; xmm_shadow[19] = block[jno19];
- pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3 ; xmm_shadow[27] = block[jno27];
- pinsrw %34, word [r12 + %6 * SIZEOF_WORD], 4 ; xmm_shadow[4] = block[jno4];
- pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4 ; xmm_shadow[12] = block[jno12];
- pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4 ; xmm_shadow[20] = block[jno20];
- pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4 ; xmm_shadow[28] = block[jno28];
- pinsrw %34, word [r12 + %7 * SIZEOF_WORD], 5 ; xmm_shadow[5] = block[jno5];
- pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5 ; xmm_shadow[13] = block[jno13];
- pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5 ; xmm_shadow[21] = block[jno21];
- pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5 ; xmm_shadow[29] = block[jno29];
- pinsrw %34, word [r12 + %8 * SIZEOF_WORD], 6 ; xmm_shadow[6] = block[jno6];
- pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6 ; xmm_shadow[14] = block[jno14];
- pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6 ; xmm_shadow[22] = block[jno22];
- pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6 ; xmm_shadow[30] = block[jno30];
- pinsrw %34, word [r12 + %9 * SIZEOF_WORD], 7 ; xmm_shadow[7] = block[jno7];
- pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7 ; xmm_shadow[15] = block[jno15];
- pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7 ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
- pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7 ; xmm_shadow[31] = block[jno31];
-%else
- pinsrw %37, ebx, 7 ; xmm_shadow[31] = block[jno31];
-%endif
- pcmpgtw xmm8, %34 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm9, %35 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm10, %36 ; neg = _mm_cmpgt_epi16(neg, x1);
- pcmpgtw xmm11, %37 ; neg = _mm_cmpgt_epi16(neg, x1);
- paddw %34, xmm8 ; x1 = _mm_add_epi16(x1, neg);
- paddw %35, xmm9 ; x1 = _mm_add_epi16(x1, neg);
- paddw %36, xmm10 ; x1 = _mm_add_epi16(x1, neg);
- paddw %37, xmm11 ; x1 = _mm_add_epi16(x1, neg);
- pxor %34, xmm8 ; x1 = _mm_xor_si128(x1, neg);
- pxor %35, xmm9 ; x1 = _mm_xor_si128(x1, neg);
- pxor %36, xmm10 ; x1 = _mm_xor_si128(x1, neg);
- pxor %37, xmm11 ; x1 = _mm_xor_si128(x1, neg);
- pxor xmm8, %34 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm9, %35 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm10, %36 ; neg = _mm_xor_si128(neg, x1);
- pxor xmm11, %37 ; neg = _mm_xor_si128(neg, x1);
- movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34 ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
- movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35 ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
- movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36 ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
- movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37 ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
- movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8 ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
- movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9 ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
- movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10 ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
- movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11 ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits. temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+ add nbitsb, free_bitsb ; nbits += free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ mov tempd, code ; temp = code;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbitsb, free_bitsb ; nbits = free_bits;
+ neg free_bitsb ; free_bits = -free_bits;
+ shr tempd, nbitsb ; temp >>= nbits;
+ or tempq, put_buffer ; temp |= put_buffer;
+ movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
+ bswap tempq ; temp = htonl(temp);
+ mov put_buffer, codeq ; put_buffer = code;
+ pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+ %2
+ pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
+ mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
+ ; (speculative; will be overwritten if
+ ; code contains any 0xFF bytes)
+ add free_bitsb, 64 ; free_bits += 64;
+ add bufferp, 8 ; buffer += 8;
+ test code, code ; if (code == 0) /* No 0xFF bytes */
+ jz %1 ; return;
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+ ; bytes in the qword.
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer-7], 0 ; buffer[-7] = 0;
+ sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempq, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ shr tempd, 16 ; temp >>= 16;
+ mov byte [buffer], tempb ; buffer[0] = temp[0];
+ cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+ mov byte [buffer], temph ; buffer[0] = temp[1];
+ cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
+ sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+ jmp %1 ; return;
%endmacro
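A hedged C model of the flush the macro performs, under the stated entry condition that free_bits has just gone non-positive (names and the straightforward per-byte loop are illustrative; the assembly instead stores all 8 bytes speculatively and only re-walks them when the pmovmskb test finds a 0xFF byte):

    /* Hedged C model of EMIT_QWORD: top up the 64-bit bit buffer with the
       leading bits of code, write it out big-endian with 0xFF -> 0xFF 0x00
       stuffing, and keep the remaining bits of code in the buffer. */
    static void emit_qword_sketch(unsigned char **bufferp,
                                  unsigned long long *put_buffer,
                                  int *free_bits, unsigned int code, int nbits)
    {
      int fit = nbits + *free_bits;       /* bits of code that still fit (free_bits <= 0) */
      int rest = -*free_bits;             /* bits of code that do not fit */
      unsigned long long q = (*put_buffer << fit) | (code >> rest);
      unsigned char *buffer = *bufferp;
      for (int i = 7; i >= 0; i--) {      /* bytes in big-endian order, as after bswap */
        unsigned char c = (unsigned char)(q >> (i * 8));
        *buffer++ = c;
        if (c == 0xFF)                    /* byte stuffing, as in jchuff.c */
          *buffer++ = 0;
      }
      *bufferp = buffer;
      *put_buffer = code;                 /* only the low `rest` bits remain live */
      *free_bits = 64 - rest;
    }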
;
@@ -165,181 +184,399 @@ EXTN(jconst_huff_encode_one_block):
; JCOEFPTR block, int last_dc_val,
; c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13d = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1 rbp - (DCTSIZE2 * SIZEOF_WORD)
-%define t2 t1 - (DCTSIZE2 * SIZEOF_WORD)
-%define put_buffer r8
-%define put_bits r9d
-%define buffer rax
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel. In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support. The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.) This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8 - dctbl --> code_temp
+; r9 - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer rax
+%ifdef WIN64
+%define bufferp rax
+%else
+%define bufferp raxp
+%endif
+%define tempq rbx
+%define tempd ebx
+%define tempb bl
+%define temph bh
+%define nbitsq rcx
+%define nbits ecx
+%define nbitsb cl
+%define block rdx
+%define nbits_base rsi
+%define t rdi
+%define td edi
+%define codeq rbp
+%define code ebp
+%define dctbl r8
+%define actbl r9
+%define state r10
+%define index r11
+%define indexd r11d
+%define put_buffer r12
+%define put_bufferd r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
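The right-hand side of the diagram is the zigzag (jpeg_natural_order) scan; "xx" marks the DC slot, which is encoded separately via code_temp. A hedged scalar model of what the shuffle-and-store network computes into t_ (the indexing is read off the register comments; jpeg_natural_order is the zigzag table declared in jpegint.h, and the function name is illustrative):

    /* Hedged scalar model of Step 1: gather the 63 AC coefficients in zigzag
       order and pre-decrement negative values, so that the later NBITS() load
       and MASK_BITS() mask yield the JPEG bit pattern directly. */
    extern const int jpeg_natural_order[];       /* zigzag order, from jpegint.h */

    static void reorder_sketch(const short *block, short *t_ /* 63 entries */)
    {
      for (int k = 1; k < 64; k++) {             /* k = 0 (the DC, "xx") is skipped */
        int v = block[jpeg_natural_order[k]];
        t_[k - 1] = (short)(v < 0 ? v - 1 : v);  /* pcmpgtw/paddw sign adjust */
      }
    }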
align 32
GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ mov buffer, rdx
+ mov block, r8
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
+ push rbx
push rbp
- mov rax, rsp ; rax = original rbp
- sub rsp, byte 4
- and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
- mov [rsp], rax
- mov rbp, rsp ; rbp = aligned rbp
- lea rsp, [t2]
- push_xmm 4
- collect_args 6
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push rsi
+ push rdi
+ push r12
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ mov state, rcx
+ movsx code, word [block] ;Z: code = block[0];
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub code, r9d ;Z: code -= last_dc_val;
+ mov dctbl, POINTER [rsp+6*8+4*8]
+ mov actbl, POINTER [rsp+6*8+5*8]
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea nbits_base, [rel jpeg_nbits_table]
+ add rsp, -DCTSIZE2 * SIZEOF_WORD
+ mov t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+ ;X: X = code stream
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
push rbx
+ push rbp
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
+ push r12
+ mov state, rdi
+ mov buffer, rsi
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
+ movsx codeq, word [block] ;Z: code = block[0];
+ lea nbits_base, [rel jpeg_nbits_table]
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ sub codeq, rcx ;Z: code -= last_dc_val;
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
+ lea t, [rsp - DCTSIZE2 * SIZEOF_WORD] ; use red zone for t_
- mov buffer, r11 ; r11 is now sratch
-
- mov put_buffer, MMWORD [r10+16] ; put_buffer = state->cur.put_buffer;
- mov put_bits, dword [r10+24] ; put_bits = state->cur.put_bits;
- push r10 ; r10 is now scratch
-
- ; Encode the DC coefficient difference per section F.1.2.1
- movsx edi, word [r12] ; temp = temp2 = block[0] - last_dc_val;
- sub edi, r13d ; r13 is not used anymore
- mov ebx, edi
-
- ; This is a well-known technique for obtaining the absolute value
- ; without a branch. It is derived from an assembly language technique
- ; presented in "How to Optimize for the Pentium Processors",
- ; Copyright (c) 1996, 1997 by Agner Fog.
- mov esi, edi
- sar esi, 31 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
- xor edi, esi ; temp ^= temp3;
- sub edi, esi ; temp -= temp3;
-
- ; For a negative input, want temp2 = bitwise complement of abs(input)
- ; This code assumes we are on a two's complement machine
- add ebx, esi ; temp2 += temp3;
-
- ; Find the number of bits needed for the magnitude of the coefficient
- lea r11, [rel EXTN(jpeg_nbits_table)]
- movzx rdi, byte [r11 + rdi] ; nbits = JPEG_NBITS(temp);
- ; Emit the Huffman-coded symbol for the number of bits
- mov r11d, INT [r14 + rdi * 4] ; code = dctbl->ehufco[nbits];
- movzx esi, byte [r14 + rdi + 1024] ; size = dctbl->ehufsi[nbits];
- EMIT_BITS r11, esi ; EMIT_BITS(code, size)
-
- ; Mask off any extra bits in code
- mov esi, 1
- mov ecx, edi
- shl esi, cl
- dec esi
- and ebx, esi ; temp2 &= (((JLONG)1)<<nbits) - 1;
-
- ; Emit that number of bits of the value, if positive,
- ; or the complement of its magnitude, if negative.
- EMIT_BITS rbx, edi ; EMIT_BITS(temp2, nbits)
-
- ; Prepare data
- xor ebx, ebx
- kloop_prepare 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, \
- 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, \
- 27, 20, 13, 6, 7, 14, 21, 28, 35, \
- xmm0, xmm1, xmm2, xmm3
- kloop_prepare 32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
- 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
- 53, 60, 61, 54, 47, 55, 62, 63, 63, \
- xmm4, xmm5, xmm6, xmm7
-
- pxor xmm8, xmm8
- pcmpeqw xmm0, xmm8 ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
- pcmpeqw xmm1, xmm8 ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
- pcmpeqw xmm2, xmm8 ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
- pcmpeqw xmm3, xmm8 ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
- pcmpeqw xmm4, xmm8 ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
- pcmpeqw xmm5, xmm8 ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
- pcmpeqw xmm6, xmm8 ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
- pcmpeqw xmm7, xmm8 ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
- packsswb xmm0, xmm1 ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
- packsswb xmm2, xmm3 ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
- packsswb xmm4, xmm5 ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
- packsswb xmm6, xmm7 ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
- pmovmskb r11d, xmm0 ; index = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
- pmovmskb r12d, xmm2 ; index = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
- pmovmskb r13d, xmm4 ; index = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
- pmovmskb r14d, xmm6 ; index = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
- shl r12, 16
- shl r14, 16
- or r11, r12
- or r13, r14
- shl r13, 32
- or r11, r13
- not r11 ; index = ~index;
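
In intrinsics terms, the pcmpeqw/packsswb/pmovmskb cascade above, together with the final not, produces a 64-bit map with bit k set when reordered coefficient k is nonzero. A stand-alone sketch, leaving out the zig-zag reordering that kloop_prepare performs:

  #include <emmintrin.h>
  #include <stdint.h>

  /* Bit k of the result is 1 when t1[k] != 0 (t1 holds the 64 reordered
   * coefficients).  Mirrors the SSE2 sequence in the removed code. */
  static uint64_t nonzero_bitmap(const int16_t t1[64])
  {
    const __m128i zero = _mm_setzero_si128();
    uint64_t index = 0;
    int k;

    for (k = 0; k < 64; k += 16) {
      __m128i lo = _mm_loadu_si128((const __m128i *)&t1[k]);
      __m128i hi = _mm_loadu_si128((const __m128i *)&t1[k + 8]);
      __m128i b;
      lo = _mm_cmpeq_epi16(lo, zero);               /* 0xFFFF where zero  */
      hi = _mm_cmpeq_epi16(hi, zero);
      b = _mm_packs_epi16(lo, hi);                  /* one byte per coef  */
      index |= (uint64_t)_mm_movemask_epi8(b) << k;
    }
    return ~index;                                  /* 1 = nonzero        */
  }
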
-
- ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
- ;jmp .EFN
-
- mov r13d, INT [r15 + 240 * 4] ; code_0xf0 = actbl->ehufco[0xf0];
- movzx r14d, byte [r15 + 1024 + 240] ; size_0xf0 = actbl->ehufsi[0xf0];
- lea rsi, [t1]
-.BLOOP:
- bsf r12, r11 ; r = __builtin_ctzl(index);
- jz .ELOOP
- mov rcx, r12
- lea rsi, [rsi+r12*2] ; k += r;
- shr r11, cl ; index >>= r;
- movzx rdi, word [rsi] ; temp = t1[k];
- lea rbx, [rel EXTN(jpeg_nbits_table)]
- movzx rdi, byte [rbx + rdi] ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
- cmp r12, 16 ; while (r > 15) {
- jl .ERLOOP
- EMIT_BITS r13, r14d ; EMIT_BITS(code_0xf0, size_0xf0)
- sub r12, 16 ; r -= 16;
- jmp .BRLOOP
-.ERLOOP:
- ; Emit Huffman symbol for run length / number of bits
- CHECKBUF31 ; uses rcx, rdx
-
- shl r12, 4 ; temp3 = (r << 4) + nbits;
- add r12, rdi
- mov ebx, INT [r15 + r12 * 4] ; code = actbl->ehufco[temp3];
- movzx ecx, byte [r15 + r12 + 1024] ; size = actbl->ehufsi[temp3];
- PUT_BITS rbx
-
- ;EMIT_CODE(code, size)
-
- movsx ebx, word [rsi-DCTSIZE2*2] ; temp2 = t2[k];
- ; Mask off any extra bits in code
- mov rcx, rdi
- mov rdx, 1
- shl rdx, cl
- dec rdx
- and rbx, rdx ; temp2 &= (((JLONG)1)<<nbits) - 1;
- PUT_BITS rbx ; PUT_BITS(temp2, nbits)
-
- shr r11, 1 ; index >>= 1;
- add rsi, 2 ; ++k;
- jmp .BLOOP
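
Expressed as C, the removed .BLOOP/.BRLOOP/.ERLOOP flow walks that bitmap with bsf, emits a ZRL (0xF0) symbol for every 16 zeros skipped, then the combined run/size symbol followed by the value bits. A rough scalar model, with emit_bits() again a placeholder:

  #include <stdint.h>

  /* t1[] holds magnitudes, t2[] the values to emit, index has one bit per
   * remaining nonzero AC coefficient (bit 0 = current position k). */
  static void encode_ac_old(uint64_t index, const int16_t *t1, const int16_t *t2,
                            const unsigned *ehufco, const unsigned char *ehufsi,
                            void (*emit_bits)(unsigned code, int size))
  {
    int k = 0;
    while (index) {
      int r = __builtin_ctzll(index);       /* bsf: zero run before next coef */
      int temp, nbits = 0;
      k += r;
      index >>= r;
      for (temp = t1[k]; temp; temp >>= 1)
        nbits++;                            /* nbits = JPEG_NBITS(t1[k])      */
      while (r > 15) {                      /* .BRLOOP: ZRL codes             */
        emit_bits(ehufco[0xf0], ehufsi[0xf0]);
        r -= 16;
      }
      emit_bits(ehufco[(r << 4) + nbits],   /* run/size symbol                */
                ehufsi[(r << 4) + nbits]);
      emit_bits((unsigned)t2[k] & ((1u << nbits) - 1), nbits);
      index >>= 1;                          /* consume this coefficient       */
      k++;
    }
    /* .ELOOP: if the block did not end on coefficient 63, the caller still
     * emits the EOB symbol ehufco[0]/ehufsi[0]. */
  }
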
-.ELOOP:
- ; If the last coef(s) were zero, emit an end-of-block code
- lea rdi, [t1 + (DCTSIZE2-1) * 2] ; r = DCTSIZE2-1-k;
- cmp rdi, rsi ; if (r > 0) {
- je .EFN
- mov ebx, INT [r15] ; code = actbl->ehufco[0];
- movzx r12d, byte [r15 + 1024] ; size = actbl->ehufsi[0];
- EMIT_BITS rbx, r12d
-.EFN:
- pop r10
- ; Save put_buffer & put_bits
- mov MMWORD [r10+16], put_buffer ; state->cur.put_buffer = put_buffer;
- mov dword [r10+24], put_bits ; state->cur.put_bits = put_bits;
+%endif
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
+ ;A: (Row 0, offset 1)
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
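
Each lettered row (A through H) gathers eight coefficients in zig-zag order, then the pcmpgtw/paddw pair biases negative values by -1 before the row is stored into t[]; the low nbits of v - 1 are exactly the one's-complement pattern that the bitstream needs for a negative coefficient. Per element, as a scalar sketch:

  #include <stdint.h>

  /* Scalar equivalent of the per-row pcmpgtw/paddw step. */
  static inline int16_t bias_negative(int16_t v)
  {
    int16_t is_neg = (int16_t)(v < 0 ? -1 : 0);  /* pcmpgtw vs. zero */
    return (int16_t)(v + is_neg);                /* paddw            */
  }
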
+
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
+ ; (Row 1, offset 1)
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+ ; w/ signed saturation
+
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
+ ; (Row 3, offset 1)
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
+ cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
+ ;Z: i.e. if code is positive
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
+ adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
+ movsxd codeq, code ;Z: sign extend code
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
+ ; (Row 2, offset 1)
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
+ movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+ ; w/ signed saturation
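
The Z-stream instructions interleaved above apply the same bias to the DC difference without SIMD: the cmp against 1 << 31 sets the carry flag exactly when code is non-negative, so the following adc code, -1 subtracts one only from negative values. A scalar sketch of that pair:

  /* Scalar model of the cmp/adc trick applied to the DC difference. */
  static int bias_dc(int code)
  {
    int carry = (unsigned)code < 0x80000000u;  /* cmp code, 1 << 31 */
    return code + (-1 + carry);                /* adc code, -1      */
  }
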
+
+ movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
+ pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
+ pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
+ shl tempd, 16 ;Z: temp <<= 16;
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
+ or put_bufferd, tempd ;Z: put_buffer |= temp;
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
+ ; (Row 7, offset 1)
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
+ mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+ ;Z: temp = dctbl->ehufco[nbits];
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
+ and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
+ shl tempq, nbitsb ;Z: temp <<= nbits;
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
+ or code, tempd ;Z: code |= temp;
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
+ ; (Row 6, offset 1)
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
+ ; (Row 5, offset 1)
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
+
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+ ; w/ signed saturation
+
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+ pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
+ movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
+ ; (Row 4, offset 1)
+%undef block
+%define free_bitsq rdx
+%define free_bitsd edx
+%define free_bitsb dl
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+ shl tempq, 48 ;Z: temp <<= 48;
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
+ or tempq, put_buffer ;Z: temp |= put_buffer;
+ movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
+ lea t, [dword t - 2] ;Z: t = &t[-1];
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+ ; w/ signed saturation
+
+ add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+ ;Z: nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp r8d
+ pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
+ mov free_bitsd, [state+working_state.cur.free_bits]
+ ;Z: free_bits = state->cur.free_bits;
+ pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
+ shl index, 32 ;Z: index <<= 32;
+ mov put_buffer, [state+working_state.cur.put_buffer.simd]
+ ;Z: put_buffer = state->cur.put_buffer.simd;
+ or index, tempq ;Z: index |= temp;
+ not index ;Z: index = ~index;
+ sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
+ jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
+ align 16
+.EMIT_CODE: ;Z: .EMIT_CODE:
+ EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
+
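
The Z stream keeps the bit writer in two registers: a 64-bit put_buffer that codes are shifted into from the right, and a free_bits counter. EMIT_QWORD is reached only when the counter underflows, at which point the filled buffer is written out (the real macro also byte-swaps and stuffs a 0x00 after any 0xFF byte) and the leftover low bits of the pending code restart the buffer. A simplified C model of that bookkeeping, with flush8() as a placeholder and stuffing omitted:

  #include <stdint.h>

  typedef struct {
    uint64_t put_buffer;  /* pending bits, newest at the bottom */
    int free_bits;        /* bits still free in put_buffer      */
  } bit_state;

  /* Assumes 0 <= free_bits < 64 on entry and nbits < 64. */
  static void put_bits(bit_state *s, uint64_t code, int nbits,
                       void (*flush8)(uint64_t filled_buffer))
  {
    s->free_bits -= nbits;
    if (s->free_bits >= 0) {                 /* fast path: just shift in */
      s->put_buffer = (s->put_buffer << nbits) | code;
      return;
    }
    /* Slow path: top the buffer up to 64 valid bits, flush it, and
     * restart with the leftover low bits of code. */
    s->put_buffer = (s->put_buffer << (nbits + s->free_bits)) |
                    (code >> -s->free_bits);
    flush8(s->put_buffer);
    s->put_buffer = code;                    /* stale high bits shift out
                                                before the next flush     */
    s->free_bits += 64;
  }
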
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.BRLOOP: ; do {
+ lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+ ; nbits = actbl->ehufsi[0xf0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+ ; code = actbl->ehufco[0xf0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ mov nbits, code_temp ; nbits = code_temp;
+ or put_buffer, codeq ; put_buffer |= code;
+ cmp nbits, 16 ; if (nbits <= 16)
+ jle .ERLOOP ; break;
+ jmp .BRLOOP ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+ times 5 nop
+.ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.BLOOP_COND: ; .BLOOP_COND:
+ test index, index ; if (index != 0)
+ jz .ELOOP ; {
+.BLOOP: ; do {
+ xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
+ tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
+ inc nbits ; ++nbits;
+ lea t, [t + nbitsq * 2] ; t = &t[nbits];
+ shr index, nbitsb ; index >>= nbits;
+.EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
+ cmp nbits, 16 ; if (nbits > 16)
+ jg .BRLOOP ; goto .BRLOOP;
+.ERLOOP: ; .ERLOOP:
+ movsx codeq, word [t] ; code = *t;
+ lea tempd, [nbitsq * 2] ; temp = nbits * 2;
+ movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
+ lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
+ mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+ ; code_temp = actbl->ehufco[temp-16];
+ shl code_temp, nbitsb ; code_temp <<= nbits;
+ and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
+ add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+ ; free_bits -= actbl->ehufsi[temp-16];
+ or code, code_temp ; code |= code_temp;
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jle .EMIT_CODE ; goto .EMIT_CODE;
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+ test index, index
+ jnz .BLOOP ; } while (index != 0);
+.ELOOP: ; } /* index != 0 */
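
Putting the pieces together: the new .BLOOP scans the nonzero bitmap with tzcnt, folds run and size into a single table index (hence the temp - 16 addressing), diverts to .BRLOOP only for runs longer than 15, and the code that follows below appends an EOB symbol unless the block ended on its last coefficient. A scalar model of that control flow, reusing bit_state/put_bits from the sketch above:

  /* index: one bit per nonzero coefficient; t[]: biased values written by
   * rows A..H, where t[k] is zig-zag position k + 1. */
  static void encode_ac_new(bit_state *s, uint64_t index, const int16_t *t,
                            const unsigned *ehufco, const unsigned char *ehufsi,
                            void (*flush8)(uint64_t))
  {
    int k = -1;                                /* matches lea t, [t - 2]  */
    while (index) {
      int run = __builtin_ctzll(index) + 1;    /* tzcnt; inc nbits        */
      int code, v, nbits = 0, rs;
      k += run;
      index >>= run;
      while (run > 16) {                       /* .BRLOOP: ZRL symbols    */
        put_bits(s, ehufco[0xf0], ehufsi[0xf0], flush8);
        run -= 16;
      }
      code = t[k];
      for (v = code < 0 ? -code - 1 : code; v; v >>= 1)
        nbits++;                               /* JPEG_NBITS(code)        */
      rs = (run - 1) * 16 + nbits;             /* temp - 16 in the asm    */
      put_bits(s, ((uint64_t)ehufco[rs] << nbits) |
                  ((unsigned)code & ((1u << nbits) - 1)),
               ehufsi[rs] + nbits, flush8);
    }
    if (k != 62)                               /* last nonzero was not the
                                                  final coefficient: EOB  */
      put_bits(s, ehufco[0], ehufsi[0], flush8);
  }
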
+ sub td, esp ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+ cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
+%else
+ cmp td, -2 * SIZEOF_WORD ; if (t != -2)
+%endif
+ je .EFN ; {
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+ ; nbits = actbl->ehufsi[0];
+ mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
+ sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
+ jg .EFN_SKIP_EMIT_CODE ; {
+ EMIT_QWORD .EFN ; insert code, flush buffer
+ align 16
+.EFN_SKIP_EMIT_CODE: ; } else {
+ shl put_buffer, nbitsb ; put_buffer <<= nbits;
+ or put_buffer, codeq ; put_buffer |= code;
+.EFN: ; } }
+ mov [state + working_state.cur.put_buffer.simd], put_buffer
+ ; state->cur.put_buffer.simd = put_buffer;
+ mov byte [state + working_state.cur.free_bits], free_bitsb
+ ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+ sub rsp, -DCTSIZE2 * SIZEOF_WORD
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
pop rbx
- uncollect_args 6
- pop_xmm 4
- mov rsp, rbp ; rsp <- aligned rbp
- pop rsp ; rsp <- original rbp
+%else
+ pop r12
pop rbp
+ pop rbx
+%endif
ret
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ align 16
+.EMIT_BRLOOP_CODE:
+ EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+ ; insert code, flush buffer,
+ ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
align 32
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
index d9922bb..b32527a 100644
--- a/simd/x86_64/jcsample-avx2.asm
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -71,7 +72,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
push rax
push rcx
- mov rdi, JSAMPROW [rsi]
+ mov rdip, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
@@ -107,8 +108,8 @@ EXTN(jsimd_h2v1_downsample_avx2):
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_YMMWORD
jae short .columnloop
@@ -233,7 +234,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
push rax
push rcx
- mov rdi, JSAMPROW [rsi]
+ mov rdip, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
@@ -269,9 +270,9 @@ EXTN(jsimd_h2v2_downsample_avx2):
push rdi
push rsi
- mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_YMMWORD
jae short .columnloop
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
index 0f107e9..2fcfe45 100644
--- a/simd/x86_64/jcsample-sse2.asm
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +71,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
push rax
push rcx
- mov rdi, JSAMPROW [rsi]
+ mov rdip, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
@@ -105,8 +106,8 @@ EXTN(jsimd_h2v1_downsample_sse2):
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
@@ -215,7 +216,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
push rax
push rcx
- mov rdi, JSAMPROW [rsi]
+ mov rdip, JSAMPROW [rsi]
add rdi, rdx
mov al, JSAMPLE [rdi-1]
@@ -250,9 +251,9 @@ EXTN(jsimd_h2v2_downsample_sse2):
push rdi
push rsi
- mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
+ mov rdip, JSAMPROW [rdi] ; outptr
cmp rcx, byte SIZEOF_XMMWORD
jae short .columnloop
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index 677b8ed..2370fda 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,9 +59,9 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -79,10 +80,10 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr0
- mov rbx, JSAMPROW [rbx] ; inptr1
- mov rdx, JSAMPROW [rdx] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
.columnloop:
vmovdqu ymm5, YMMWORD [rbx] ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index 071aa62..e07c8d7 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
lea rbx, [rbx+rcx*SIZEOF_JSAMPROW]
lea rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -78,10 +79,10 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
push rsi
push rcx ; col
- mov rsi, JSAMPROW [rsi] ; inptr0
- mov rbx, JSAMPROW [rbx] ; inptr1
- mov rdx, JSAMPROW [rdx] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr0
+ mov rbxp, JSAMPROW [rbx] ; inptr1
+ mov rdxp, JSAMPROW [rdx] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
.columnloop:
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF)
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index bb733c5..8b264b4 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -4,6 +4,7 @@
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,14 +59,14 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
- mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
- mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
- mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
pop rcx ; col
@@ -514,15 +515,16 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
mov rbx, rsp
push rdi
@@ -546,16 +548,16 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
pop rax
pop rcx
pop rdi
- pop rsi
- pop rbx
- pop rdx
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
add rdi, byte SIZEOF_JSAMPROW ; outptr1
add rsi, byte SIZEOF_JSAMPROW ; inptr01
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
mov rbx, rsp
push rdi
@@ -579,9 +581,10 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
pop rax
pop rcx
pop rdi
- pop rsi
- pop rbx
- pop rdx
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
pop rbx
uncollect_args 4
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index b176a4c..eb3ab9d 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,14 +58,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
- mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
- mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
- mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0
+ mov rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1
+ mov rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2
+ mov rdip, JSAMPROW [rdi] ; outptr
pop rcx ; col
@@ -456,15 +457,16 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
mov rdi, r11
mov ecx, r12d
- mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
- mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
- mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+ mov rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
mov rdi, r13
lea rsi, [rsi+rcx*SIZEOF_JSAMPROW]
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
+ sub rsp, SIZEOF_JSAMPARRAY*4
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
mov rbx, rsp
push rdi
@@ -488,16 +490,16 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
pop rax
pop rcx
pop rdi
- pop rsi
- pop rbx
- pop rdx
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
add rdi, byte SIZEOF_JSAMPROW ; outptr1
add rsi, byte SIZEOF_JSAMPROW ; inptr01
- push rdx ; inptr2
- push rbx ; inptr1
- push rsi ; inptr00
+ mov JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip ; inptr00
+ mov JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp ; inptr1
+ mov JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp ; inptr2
mov rbx, rsp
push rdi
@@ -521,9 +523,10 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
pop rax
pop rcx
pop rdi
- pop rsi
- pop rbx
- pop rdx
+ mov rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+ mov rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+ mov rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+ add rsp, SIZEOF_JSAMPARRAY*4
pop rbx
uncollect_args 4
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
index fc274a9..1e4979f 100644
--- a/simd/x86_64/jdsample-avx2.asm
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -76,7 +77,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
vpcmpeqb xmm9, xmm9, xmm9
@@ -90,8 +91,8 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
test rax, SIZEOF_YMMWORD-1
jz short .skip
@@ -235,18 +236,18 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rax ; colctr
push rcx
push rdi
push rsi
- mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's)
vpcmpeqb xmm9, xmm9, xmm9
@@ -539,13 +540,13 @@ EXTN(jsimd_h2v1_upsample_avx2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
mov rax, rdx ; colctr
.columnloop:
@@ -629,14 +630,14 @@ EXTN(jsimd_h2v2_upsample_avx2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
mov rax, rdx ; colctr
.columnloop:
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
index 20e0767..38dbcee 100644
--- a/simd/x86_64/jdsample-sse2.asm
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -74,14 +75,14 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rax ; colctr
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
test rax, SIZEOF_XMMWORD-1
jz short .skip
@@ -221,18 +222,18 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rax ; colctr
push rcx
push rdi
push rsi
- mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
- mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
test rax, SIZEOF_XMMWORD-1
jz short .skip
@@ -512,13 +513,13 @@ EXTN(jsimd_h2v1_upsample_sse2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rdi, JSAMPROW [rdi] ; outptr
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rdip, JSAMPROW [rdi] ; outptr
mov rax, rdx ; colctr
.columnloop:
@@ -600,14 +601,14 @@ EXTN(jsimd_h2v2_upsample_sse2):
mov rsi, r12 ; input_data
mov rdi, r13
- mov rdi, JSAMPARRAY [rdi] ; output_data
+ mov rdip, JSAMPARRAY [rdi] ; output_data
.rowloop:
push rdi
push rsi
- mov rsi, JSAMPROW [rsi] ; inptr
- mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
- mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+ mov rsip, JSAMPROW [rsi] ; inptr
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
mov rax, rdx ; colctr
.columnloop:
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
index 6ad4cf0..e56258b 100644
--- a/simd/x86_64/jfdctint-avx2.asm
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
@@ -103,7 +103,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endmacro
; --------------------------------------------------------------------------
-; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
; %1-%4: Input/output registers
; %5-%8: Temp registers
; %9: Pass (1 or 2)
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
index 5d0de3c..ec1f383 100644
--- a/simd/x86_64/jfdctint-sse2.asm
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -2,7 +2,7 @@
; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +14,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index ab95e1a..60bf961 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -455,12 +456,12 @@ EXTN(jsimd_idct_float_sse2):
pshufd xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
pshufd xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index a66a681..cb97fdf 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -460,21 +461,21 @@ EXTN(jsimd_idct_ifast_sse2):
pshufd xmm6, xmm4, 0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm2, xmm7, 0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index 50270f4..ca7e317 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -2,7 +2,8 @@
; jidctint.asm - accurate integer IDCT (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +15,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
@@ -113,7 +114,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
%endmacro
; --------------------------------------------------------------------------
-; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
; %1-%4: Input/output registers
; %5-%12: Temp registers
; %9: Pass (1 or 2)
@@ -387,23 +388,23 @@ EXTN(jsimd_idct_islow_avx2):
mov eax, r13d
- mov rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
- mov rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
- mov rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 034530c..7aa869b 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -2,7 +2,8 @@
; jidctint.asm - accurate integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -14,7 +15,7 @@
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
-; This file contains a slow-but-accurate integer implementation of the
+; This file contains a slower but more accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.
@@ -817,21 +818,21 @@ EXTN(jsimd_idct_islow_sse2):
pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
- mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
- mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index 7fbfcc5..4ece9d8 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -379,12 +380,12 @@ EXTN(jsimd_idct_4x4_sse2):
pshufd xmm1, xmm4, 0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
pshufd xmm3, xmm4, 0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
- mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
@@ -558,8 +559,8 @@ EXTN(jsimd_idct_2x2_sse2):
pextrw ebx, xmm6, 0x00 ; ebx=(C0 D0 -- --)
pextrw ecx, xmm6, 0x01 ; ecx=(C1 D1 -- --)
- mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
- mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+ mov rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
mov word [rdx+rax*SIZEOF_JSAMPLE], bx
mov word [rsi+rax*SIZEOF_JSAMPLE], cx
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
index 83596a9..ab2e395 100644
--- a/simd/x86_64/jquantf-sse2.asm
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,8 +52,8 @@ EXTN(jsimd_convsamp_float_sse2):
mov rdi, r12
mov rcx, DCTSIZE/2
.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
index 5f04d22..70fe811 100644
--- a/simd/x86_64/jquanti-avx2.asm
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -4,6 +4,7 @@
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -44,23 +45,23 @@ EXTN(jsimd_convsamp_avx2):
mov eax, r11d
- mov rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
pinsrq xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
- mov rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
pinsrq xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
- mov rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
pinsrq xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
- mov rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
pinsrq xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
index bb6fa69..3ee4420 100644
--- a/simd/x86_64/jquanti-sse2.asm
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -3,6 +3,7 @@
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,14 +52,14 @@ EXTN(jsimd_convsamp_sse2):
mov rdi, r12
mov rcx, DCTSIZE/4
.convloop:
- mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
- mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
- mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
+ mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index dc639fc..eb76679 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -472,12 +472,6 @@ jsimd_can_h2v1_fancy_upsample(void)
return 0;
}
-GLOBAL(int)
-jsimd_can_h1v2_fancy_upsample(void)
-{
- return 0;
-}
-
GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -506,12 +500,6 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
output_data_ptr);
}
-GLOBAL(void)
-jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
- JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-}
-
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
@@ -1043,8 +1031,6 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
- if (SIZEOF_SIZE_T != 8)
- return 0;
if (simd_support & JSIMD_SSE2)
return 1;
@@ -1069,8 +1055,6 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
return 0;
if (sizeof(JCOEF) != 2)
return 0;
- if (SIZEOF_SIZE_T != 8)
- return 0;
if (simd_support & JSIMD_SSE2)
return 1;