From 3e329d5dc967398cb425588d0a25ad611ff92a1c Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 8 Nov 2018 17:47:20 +0000 Subject: [PATCH 09/11] ARM: NEON assembly optimization for function BlitRGBtoRGBPixelAlpha --- src/video/SDL_blit_A.c | 21 + src/video/arm/pixman-arm-neon-asm.S | 159 ++++ src/video/arm/pixman-arm-neon-asm.h | 1184 +++++++++++++++++++++++++++ 3 files changed, 1364 insertions(+) create mode 100644 src/video/arm/pixman-arm-neon-asm.S create mode 100644 src/video/arm/pixman-arm-neon-asm.h diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index a1d087d4e..53a71f3cf 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -421,6 +421,23 @@ BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info) } #endif +#if SDL_ARM_NEON_BLITTERS +void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); + +static void +BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info) +{ + int32_t width = info->dst_w; + int32_t height = info->dst_h; + uint32_t *dstp = (uint32_t *)info->dst; + int32_t dststride = width + (info->dst_skip >> 2); + uint32_t *srcp = (uint32_t *)info->src; + int32_t srcstride = width + (info->src_skip >> 2); + + BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); +} +#endif + /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info) @@ -1356,6 +1373,10 @@ SDL_CalculateBlitA(SDL_Surface * surface) } #endif /* __MMX__ || __3dNOW__ */ if (sf->Amask == 0xff000000) { +#if SDL_ARM_NEON_BLITTERS + if (SDL_HasNEON()) + return BlitRGBtoRGBPixelAlphaARMNEON; +#endif #if SDL_ARM_SIMD_BLITTERS if (SDL_HasARMSIMD()) return BlitRGBtoRGBPixelAlphaARMSIMD; diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S new file mode 100644 index 000000000..72fd3a2b4 --- /dev/null +++ b/src/video/arm/pixman-arm-neon-asm.S @@ -0,0 +1,159 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * Copyright (c) 2018 RISC OS Open Ltd + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +/* Prevent the stack from becoming executable for no reason... */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .fpu neon + .arch armv7a + .object_arch armv4 + .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ + .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ + .arm + .altmacro + .p2align 2 + +#include "pixman-arm-asm.h" +#include "pixman-arm-neon-asm.h" + +/* Global configuration options and preferences */ + +/* + * The code can optionally make use of unaligned memory accesses to improve + * performance of handling leading/trailing pixels for each scanline. + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for + * example in linux if unaligned memory accesses are not configured to + * generate.exceptions. + */ +.set RESPECT_STRICT_ALIGNMENT, 1 + +/* + * Set default prefetch type. There is a choice between the following options: + * + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work + * as NOP to workaround some HW bugs or for whatever other reason) + * + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where + * advanced prefetch intruduces heavy overhead) + * + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 + * which can run ARM and NEON instructions simultaneously so that extra ARM + * instructions do not add (many) extra cycles, but improve prefetch efficiency) + * + * Note: some types of function can't support advanced prefetch and fallback + * to simple one (those which handle 24bpp pixels) + */ +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED + +/* Prefetch distance in pixels for simple prefetch */ +.set PREFETCH_DISTANCE_SIMPLE, 64 + +/******************************************************************************/ + +.macro RGBtoRGBPixelAlpha_process_pixblock_head + vmvn d30, d3 /* get inverted source alpha */ + vmov d31, d7 /* dest alpha is always unchanged */ + vmull.u8 q14, d0, d3 + vmlal.u8 q14, d4, d30 + vmull.u8 q0, d1, d3 + vmlal.u8 q0, d5, d30 + vmull.u8 q1, d2, d3 + vmlal.u8 q1, d6, d30 + vrshr.u16 q2, q14, #8 + vrshr.u16 q3, q0, #8 + vraddhn.u16 d28, q14, q2 + vrshr.u16 q2, q1, #8 + vraddhn.u16 d29, q0, q3 + vraddhn.u16 d30, q1, q2 +.endm + +.macro RGBtoRGBPixelAlpha_process_pixblock_tail + /* nothing */ +.endm + +.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head + vld4.8 {d0-d3}, [SRC]! + PF add PF_X, PF_X, #8 + vst4.8 {d28-d31}, [DST_W :128]! + PF tst PF_CTL, #0xF + vld4.8 {d4-d7}, [DST_R :128]! + PF addne PF_X, PF_X, #8 + vmvn d30, d3 /* get inverted source alpha */ + vmov d31, d7 /* dest alpha is always unchanged */ + vmull.u8 q14, d0, d3 + PF subne PF_CTL, PF_CTL, #1 + vmlal.u8 q14, d4, d30 + PF cmp PF_X, ORIG_W + vmull.u8 q0, d1, d3 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] + vmlal.u8 q0, d5, d30 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] + vmull.u8 q1, d2, d3 + PF subge PF_X, PF_X, ORIG_W + vmlal.u8 q1, d6, d30 + PF subges PF_CTL, PF_CTL, #0x10 + vrshr.u16 q2, q14, #8 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! + vrshr.u16 q3, q0, #8 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! + vraddhn.u16 d28, q14, q2 + vrshr.u16 q2, q1, #8 + vraddhn.u16 d29, q0, q3 + vraddhn.u16 d30, q1, q2 +.endm + +generate_composite_function \ + BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ + 8, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + default_init, \ + default_cleanup, \ + RGBtoRGBPixelAlpha_process_pixblock_head, \ + RGBtoRGBPixelAlpha_process_pixblock_tail, \ + RGBtoRGBPixelAlpha_process_pixblock_tail_head diff --git a/src/video/arm/pixman-arm-neon-asm.h b/src/video/arm/pixman-arm-neon-asm.h new file mode 100644 index 000000000..bdcf6a9d4 --- /dev/null +++ b/src/video/arm/pixman-arm-neon-asm.h @@ -0,0 +1,1184 @@ +/* + * Copyright © 2009 Nokia Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) + */ + +/* + * This file contains a macro ('generate_composite_function') which can + * construct 2D image processing functions, based on a common template. + * Any combinations of source, destination and mask images with 8bpp, + * 16bpp, 24bpp, 32bpp color formats are supported. + * + * This macro takes care of: + * - handling of leading and trailing unaligned pixels + * - doing most of the work related to L2 cache preload + * - encourages the use of software pipelining for better instructions + * scheduling + * + * The user of this macro has to provide some configuration parameters + * (bit depths for the images, prefetch distance, etc.) and a set of + * macros, which should implement basic code chunks responsible for + * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage + * examples. + * + * TODO: + * - try overlapped pixel method (from Ian Rickards) when processing + * exactly two blocks of pixels + * - maybe add an option to do reverse scanline processing + */ + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_DEINTERLEAVE_32BPP, 2 + +/* + * Offset in stack where mask and source pointer/stride can be accessed + * from 'init' macro. This is useful for doing special handling for solid mask. + */ +.set ARGS_STACK_OFFSET, 40 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ +.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ +.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ + +/* + * Definitions of supplementary pixld/pixst macros (for partial load/store of + * pixel data). + */ + +.macro pixldst1 op, elem_size, reg1, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1}, [&mem_operand&]! +.endif +.endm + +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1, d®2}, [&mem_operand&]! +.endif +.endm + +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits +.if abits > 0 + op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! +.else + op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! +.endif +.endm + +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits + op&.&elem_size {d®1[idx]}, [&mem_operand&]! +.endm + +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand + op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! +.endm + +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand + op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! +.endm + +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits +.if numbytes == 32 + pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif numbytes == 16 + pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits +.elseif numbytes == 8 + pixldst1 op, elem_size, %(basereg+1), mem_operand, abits +.elseif numbytes == 4 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) + pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits + .elseif elem_size == 16 + pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits + pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits + .else + pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits + .endif +.elseif numbytes == 2 + .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) + pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits + .else + pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits + pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits + .endif +.elseif numbytes == 1 + pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixst numpix, bpp, basereg, mem_operand, abits=0 +.if bpp > 0 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) + pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ + %(basereg+6), %(basereg+7), mem_operand, abits +.elseif (bpp == 24) && (numpix == 8) + pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand +.elseif (bpp == 24) && (numpix == 4) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand +.elseif (bpp == 24) && (numpix == 2) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand +.elseif (bpp == 24) && (numpix == 1) + pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand +.else + pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits +.endif +.endif +.endm + +.macro pixld_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixld numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +.macro pixst_a numpix, bpp, basereg, mem_operand +.if (bpp * numpix) <= 128 + pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) +.else + pixst numpix, bpp, basereg, mem_operand, 128 +.endif +.endm + +/* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ +.macro pixld1_s elem_size, reg1, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP1, mem_operand, TMP1, asl #1 + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d®1&[0]}, [TMP1, :16] + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d®1&[1]}, [TMP2, :16] + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d®1&[2]}, [TMP1, :16] + vld1.16 {d®1&[3]}, [TMP2, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d®1&[0]}, [TMP1, :32] + vld1.32 {d®1&[1]}, [TMP2, :32] +.else + .error "unsupported" +.endif +.endm + +.macro pixld2_s elem_size, reg1, reg2, mem_operand +.if 0 /* elem_size == 32 */ + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d®1&[0]}, [TMP1, :32] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + vld1.32 {d®2&[0]}, [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d®1&[1]}, [TMP1, :32] + vld1.32 {d®2&[1]}, [TMP2, :32] +.else + pixld1_s elem_size, reg1, mem_operand + pixld1_s elem_size, reg2, mem_operand +.endif +.endm + +.macro pixld0_s elem_size, reg1, idx, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d®1&[idx]}, [TMP1, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b + add TMP1, mem_operand, TMP1, asl #2 + vld1.32 {d®1&[idx]}, [TMP1, :32] +.endif +.endm + +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +.if numbytes == 32 + pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand + pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand + pixdeinterleave elem_size, %(basereg+4) +.elseif numbytes == 16 + pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand +.elseif numbytes == 8 + pixld1_s elem_size, %(basereg+1), mem_operand +.elseif numbytes == 4 + .if elem_size == 32 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .elseif elem_size == 16 + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 4, mem_operand + pixld0_s elem_size, %(basereg+0), 5, mem_operand + pixld0_s elem_size, %(basereg+0), 6, mem_operand + pixld0_s elem_size, %(basereg+0), 7, mem_operand + .endif +.elseif numbytes == 2 + .if elem_size == 16 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .endif +.elseif numbytes == 1 + pixld0_s elem_size, %(basereg+0), 1, mem_operand +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld_s numpix, bpp, basereg, mem_operand +.if bpp > 0 + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.endif +.endm + +.macro vuzp8 reg1, reg2 + vuzp.8 d®1, d®2 +.endm + +.macro vzip8 reg1, reg2 + vzip.8 d®1, d®2 +.endm + +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixdeinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vuzp8 %(basereg+0), %(basereg+1) + vuzp8 %(basereg+2), %(basereg+3) + vuzp8 %(basereg+1), %(basereg+3) + vuzp8 %(basereg+0), %(basereg+2) +.endif +.endm + +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ +.macro pixinterleave bpp, basereg +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) + vzip8 %(basereg+0), %(basereg+2) + vzip8 %(basereg+1), %(basereg+3) + vzip8 %(basereg+2), %(basereg+3) + vzip8 %(basereg+0), %(basereg+1) +.endif +.endm + +/* + * This is a macro for implementing cache preload. The main idea is that + * cache preload logic is mostly independent from the rest of pixels + * processing code. It starts at the top left pixel and moves forward + * across pixels and can jump across scanlines. Prefetch distance is + * handled in an 'incremental' way: it starts from 0 and advances to the + * optimal distance over time. After reaching optimal prefetch distance, + * it is kept constant. There are some checks which prevent prefetching + * unneeded pixel lines below the image (but it still can prefetch a bit + * more data on the right side of the image - not a big issue and may + * be actually helpful when rendering text glyphs). Additional trick is + * the use of LDR instruction for prefetch instead of PLD when moving to + * the next line, the point is that we have a high chance of getting TLB + * miss in this case, and PLD would be useless. + * + * This sounds like it may introduce a noticeable overhead (when working with + * fully cached data). But in reality, due to having a separate pipeline and + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can + * execute simultaneously with NEON and be completely shadowed by it. Thus + * we get no performance overhead at all (*). This looks like a very nice + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, + * but still can implement some rather advanced prefetch logic in software + * for almost zero cost! + * + * (*) The overhead of the prefetcher is visible when running some trivial + * pixels processing like simple copy. Anyway, having prefetch is a must + * when working with the graphics data. + */ +.macro PF a, x:vararg +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) + a x +.endif +.endm + +.macro cache_preload std_increment, boost_increment +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) +.if regs_shortage + PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ +.endif +.if std_increment != 0 + PF add PF_X, PF_X, #std_increment +.endif + PF tst PF_CTL, #0xF + PF addne PF_X, PF_X, #boost_increment + PF subne PF_CTL, PF_CTL, #1 + PF cmp PF_X, ORIG_W +.if src_bpp_shift >= 0 + PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] +.endif +.if dst_r_bpp != 0 + PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] +.endif +.if mask_bpp_shift >= 0 + PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] +.endif + PF subge PF_X, PF_X, ORIG_W + PF subges PF_CTL, PF_CTL, #0x10 +.if src_bpp_shift >= 0 + PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! +.endif +.if dst_r_bpp != 0 + PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! +.endif +.if mask_bpp_shift >= 0 + PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! +.endif +.endif +.endm + +.macro cache_preload_simple +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) +.if src_bpp > 0 + pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] +.endif +.if dst_r_bpp > 0 + pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] +.endif +.if mask_bpp > 0 + pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] +.endif +.endif +.endm + +.macro fetch_mask_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK +.endm + +/* + * Macro which is used to process leading pixels until destination + * pointer is properly aligned (at 16 bytes boundary). When destination + * buffer uses 16bpp format, this is unnecessary, or even pointless. + */ +.macro ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head +.if dst_w_bpp != 24 + tst DST_R, #0xF + beq 2f + +.irp lowbit, 1, 2, 4, 8, 16 +local skip1 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_R, #lowbit + beq 1f +.endif + pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC + pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK +.if dst_r_bpp > 0 + pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R +.else + add DST_R, DST_R, #lowbit +.endif + PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) + sub W, W, #(lowbit * 8 / dst_w_bpp) +1: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + process_pixblock_tail + + pixinterleave dst_w_bpp, dst_w_basereg +.irp lowbit, 1, 2, 4, 8, 16 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) +.if lowbit < 16 /* we don't need more than 16-byte alignment */ + tst DST_W, #lowbit + beq 1f +.endif + pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W +1: +.endif +.endr +.endif +2: +.endm + +/* + * Special code for processing up to (pixblock_size - 1) remaining + * trailing pixels. As SIMD processing performs operation on + * pixblock_size pixels, anything smaller than this has to be loaded + * and stored in a special way. Loading and storing of pixel data is + * performed in such a way that we fill some 'slots' in the NEON + * registers (some slots naturally are unused), then perform compositing + * operation as usual. In the end, the data is taken from these 'slots' + * and saved to memory. + * + * cache_preload_flag - allows to suppress prefetch if + * set to 0 + * dst_aligned_flag - selects whether destination buffer + * is aligned + */ +.macro process_trailing_pixels cache_preload_flag, \ + dst_aligned_flag, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + tst W, #(pixblock_size - 1) + beq 2f +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 1f + pixld_src chunk_size, src_bpp, src_basereg, SRC + pixld chunk_size, mask_bpp, mask_basereg, MASK +.if dst_aligned_flag != 0 + pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.else + pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R +.endif +.if cache_preload_flag != 0 + PF add PF_X, PF_X, #chunk_size +.endif +1: +.endif +.endr + pixdeinterleave src_bpp, src_basereg + pixdeinterleave mask_bpp, mask_basereg + pixdeinterleave dst_r_bpp, dst_r_basereg + + process_pixblock_head +.if cache_preload_flag != 0 + cache_preload 0, pixblock_size + cache_preload_simple +.endif + process_pixblock_tail + pixinterleave dst_w_bpp, dst_w_basereg +.irp chunk_size, 16, 8, 4, 2, 1 +.if pixblock_size > chunk_size + tst W, #chunk_size + beq 1f +.if dst_aligned_flag != 0 + pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.else + pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W +.endif +1: +.endif +.endr +2: +.endm + +/* + * Macro, which performs all the needed operations to switch to the next + * scanline and start the next loop iteration unless all the scanlines + * are already processed. + */ +.macro advance_to_next_scanline start_of_loop_label +.if regs_shortage + ldrd W, [sp] /* load W and H (width and height) from stack */ +.else + mov W, ORIG_W +.endif + add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift +.if src_bpp != 0 + add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift +.endif +.if mask_bpp != 0 + add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift +.endif +.if (dst_w_bpp != 24) + sub DST_W, DST_W, W, lsl #dst_bpp_shift +.endif +.if (src_bpp != 24) && (src_bpp != 0) + sub SRC, SRC, W, lsl #src_bpp_shift +.endif +.if (mask_bpp != 24) && (mask_bpp != 0) + sub MASK, MASK, W, lsl #mask_bpp_shift +.endif + subs H, H, #1 + mov DST_R, DST_W +.if regs_shortage + str H, [sp, #4] /* save updated height to stack */ +.endif + bge start_of_loop_label +.endm + +/* + * Registers are allocated in the following way by default: + * d0, d1, d2, d3 - reserved for loading source pixel data + * d4, d5, d6, d7 - reserved for loading destination pixel data + * d24, d25, d26, d27 - reserved for loading mask pixel data + * d28, d29, d30, d31 - final destination pixel data for writeback to memory + */ +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + prefetch_distance, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + pixman_asm_function fname + + push {r4-r12, lr} /* save all registers */ + +/* + * Select prefetch type for this function. If prefetch distance is + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch + * has to be used instead of ADVANCED. + */ + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT +.if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ + ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE +.endif + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + + .macro pixld_src x:vararg + pixld x + .endm + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm +/* + * Assign symbolic names to registers + */ + W .req r0 /* width (is updated during processing) */ + H .req r1 /* height (is updated during processing) */ + DST_W .req r2 /* destination buffer pointer for writes */ + DST_STRIDE .req r3 /* destination image stride */ + SRC .req r4 /* source buffer pointer */ + SRC_STRIDE .req r5 /* source image stride */ + DST_R .req r6 /* destination buffer pointer for reads */ + + MASK .req r7 /* mask pointer */ + MASK_STRIDE .req r8 /* mask stride */ + + PF_CTL .req r9 /* combined lines counter and prefetch */ + /* distance increment counter */ + PF_X .req r10 /* pixel index in a scanline for current */ + /* pretetch position */ + PF_SRC .req r11 /* pointer to source scanline start */ + /* for prefetch purposes */ + PF_DST .req r12 /* pointer to destination scanline start */ + /* for prefetch purposes */ + PF_MASK .req r14 /* pointer to mask scanline start */ + /* for prefetch purposes */ +/* + * Check whether we have enough registers for all the local variables. + * If we don't have enough registers, original width and height are + * kept on top of stack (and 'regs_shortage' variable is set to indicate + * this for the rest of code). Even if there are enough registers, the + * allocation scheme may be a bit different depending on whether source + * or mask is not used. + */ +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) + ORIG_W .req r10 /* saved original width */ + DUMMY .req r12 /* temporary register */ + .set regs_shortage, 0 +.elseif mask_bpp == 0 + ORIG_W .req r7 /* saved original width */ + DUMMY .req r8 /* temporary register */ + .set regs_shortage, 0 +.elseif src_bpp == 0 + ORIG_W .req r4 /* saved original width */ + DUMMY .req r5 /* temporary register */ + .set regs_shortage, 0 +.else + ORIG_W .req r1 /* saved original width */ + DUMMY .req r1 /* temporary register */ + .set regs_shortage, 1 +.endif + + .set mask_bpp_shift, -1 +.if src_bpp == 32 + .set src_bpp_shift, 2 +.elseif src_bpp == 24 + .set src_bpp_shift, 0 +.elseif src_bpp == 16 + .set src_bpp_shift, 1 +.elseif src_bpp == 8 + .set src_bpp_shift, 0 +.elseif src_bpp == 0 + .set src_bpp_shift, -1 +.else + .error "requested src bpp (src_bpp) is not supported" +.endif +.if mask_bpp == 32 + .set mask_bpp_shift, 2 +.elseif mask_bpp == 24 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 8 + .set mask_bpp_shift, 0 +.elseif mask_bpp == 0 + .set mask_bpp_shift, -1 +.else + .error "requested mask bpp (mask_bpp) is not supported" +.endif +.if dst_w_bpp == 32 + .set dst_bpp_shift, 2 +.elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 +.elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 +.elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 +.else + .error "requested dst bpp (dst_w_bpp) is not supported" +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + +.if prefetch_distance < 0 || prefetch_distance > 15 + .error "invalid prefetch distance (prefetch_distance)" +.endif + +.if src_bpp > 0 + ldr SRC, [sp, #40] +.endif +.if mask_bpp > 0 + ldr MASK, [sp, #48] +.endif + PF mov PF_X, #0 +.if src_bpp > 0 + ldr SRC_STRIDE, [sp, #44] +.endif +.if mask_bpp > 0 + ldr MASK_STRIDE, [sp, #52] +.endif + mov DST_R, DST_W + +.if src_bpp == 24 + sub SRC_STRIDE, SRC_STRIDE, W + sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 +.endif +.if mask_bpp == 24 + sub MASK_STRIDE, MASK_STRIDE, W + sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 +.endif +.if dst_w_bpp == 24 + sub DST_STRIDE, DST_STRIDE, W + sub DST_STRIDE, DST_STRIDE, W, lsl #1 +.endif + +/* + * Setup advanced prefetcher initial state + */ + PF mov PF_SRC, SRC + PF mov PF_DST, DST_R + PF mov PF_MASK, MASK + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ + PF mov PF_CTL, H, lsl #4 + PF add PF_CTL, #(prefetch_distance - 0x10) + + init +.if regs_shortage + push {r0, r1} +.endif + subs H, H, #1 +.if regs_shortage + str H, [sp, #4] /* save updated height to stack */ +.else + mov ORIG_W, W +.endif + blt 9f + cmp W, #(pixblock_size * 2) + blt 8f +/* + * This is the start of the pipelined loop, which if optimized for + * long scanlines + */ +0: + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + PF add PF_X, PF_X, #pixblock_size + process_pixblock_head + cache_preload 0, pixblock_size + cache_preload_simple + subs W, W, #(pixblock_size * 2) + blt 2f +1: + process_pixblock_tail_head + cache_preload_simple + subs W, W, #pixblock_size + bge 1b +2: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W + + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 1, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 0b + +.if regs_shortage + pop {r0, r1} +.endif + cleanup + pop {r4-r12, pc} /* exit */ +/* + * This is the start of the loop, designed to process images with small width + * (less than pixblock_size * 2 pixels). In this case neither pipelining + * nor prefetch are used. + */ +8: + /* Process exactly pixblock_size pixels if needed */ + tst W, #pixblock_size + beq 1f + pixld pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + process_pixblock_tail + pixst pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +1: + /* Process the remaining trailing pixels in the scanline */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + advance_to_next_scanline 8b +9: +.if regs_shortage + pop {r0, r1} +.endif + cleanup + pop {r4-r12, pc} /* exit */ + + .purgem fetch_src_pixblock + .purgem pixld_src + + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq ORIG_W + .unreq W + .unreq H + .unreq SRC_STRIDE + .unreq DST_STRIDE + .unreq MASK_STRIDE + .unreq PF_CTL + .unreq PF_X + .unreq PF_SRC + .unreq PF_DST + .unreq PF_MASK + .unreq DUMMY + .endfunc +.endm + +/* + * A simplified variant of function generation template for a single + * scanline processing (for implementing pixman combine functions) + */ +.macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags, \ + pixblock_size_, \ + init, \ + cleanup, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head, \ + dst_w_basereg_ = 28, \ + dst_r_basereg_ = 4, \ + src_basereg_ = 0, \ + mask_basereg_ = 24 + + pixman_asm_function fname + + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set pixblock_size, pixblock_size_ + .set dst_w_basereg, dst_w_basereg_ + .set dst_r_basereg, dst_r_basereg_ + .set src_basereg, src_basereg_ + .set mask_basereg, mask_basereg_ + +.if use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req r0 + DST_W .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + MASK .req lr + TMP1 .req r4 + TMP2 .req r5 + DST_R .req r6 + SRC_WIDTH_FIXED .req r7 + + .macro pixld_src x:vararg + pixld_s x + .endm + + ldr UNIT_X, [sp] + push {r4-r8, lr} + ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] + .if mask_bpp != 0 + ldr MASK, [sp, #(24 + 8)] + .endif +.else + /* + * Assign symbolic names to registers + */ + W .req r0 /* width (is updated during processing) */ + DST_W .req r1 /* destination buffer pointer for writes */ + SRC .req r2 /* source buffer pointer */ + DST_R .req ip /* destination buffer pointer for reads */ + MASK .req r3 /* mask pointer */ + + .macro pixld_src x:vararg + pixld x + .endm +.endif + +.if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp +.else + .set dst_r_bpp, 0 +.endif +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) + .set DEINTERLEAVE_32BPP_ENABLED, 1 +.else + .set DEINTERLEAVE_32BPP_ENABLED, 0 +.endif + + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + + init + mov DST_R, DST_W + + cmp W, #pixblock_size + blt 8f + + ensure_destination_ptr_alignment process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + subs W, W, #pixblock_size + blt 7f + + /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ + pixld_a pixblock_size, dst_r_bpp, \ + (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R + fetch_src_pixblock + pixld pixblock_size, mask_bpp, \ + (mask_basereg - pixblock_size * mask_bpp / 64), MASK + process_pixblock_head + subs W, W, #pixblock_size + blt 2f +1: + process_pixblock_tail_head + subs W, W, #pixblock_size + bge 1b +2: + process_pixblock_tail + pixst_a pixblock_size, dst_w_bpp, \ + (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W +7: + /* Process the remaining trailing pixels in the scanline (dst aligned) */ + process_trailing_pixels 0, 1, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup +.if use_nearest_scaling != 0 + pop {r4-r8, pc} /* exit */ +.else + bx lr /* exit */ +.endif +8: + /* Process the remaining trailing pixels in the scanline (dst unaligned) */ + process_trailing_pixels 0, 0, \ + process_pixblock_head, \ + process_pixblock_tail, \ + process_pixblock_tail_head + + cleanup + +.if use_nearest_scaling != 0 + pop {r4-r8, pc} /* exit */ + + .unreq DST_R + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq DST_W + .unreq MASK + .unreq SRC_WIDTH_FIXED + +.else + bx lr /* exit */ + + .unreq SRC + .unreq MASK + .unreq DST_R + .unreq DST_W + .unreq W +.endif + + .purgem fetch_src_pixblock + .purgem pixld_src + + .endfunc +.endm + +.macro generate_composite_function_single_scanline x:vararg + generate_composite_function_scanline 0, x +.endm + +.macro generate_composite_function_nearest_scanline x:vararg + generate_composite_function_scanline 1, x +.endm + +/* Default prologue/epilogue, nothing special needs to be done */ + +.macro default_init +.endm + +.macro default_cleanup +.endm + +/* + * Prologue/epilogue variant which additionally saves/restores d8-d15 + * registers (they need to be saved/restored by callee according to ABI). + * This is required if the code needs to use all the NEON registers. + */ + +.macro default_init_need_all_regs + vpush {d8-d15} +.endm + +.macro default_cleanup_need_all_regs + vpop {d8-d15} +.endm + +/******************************************************************************/ + +/* + * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) + * into a planar a8r8g8b8 format (with a, r, g, b color components + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). + * + * Warning: the conversion is destructive and the original + * value (in) is lost. + */ +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b + vshrn.u16 out_r, in, #8 + vshrn.u16 out_g, in, #3 + vsli.u16 in, in, #5 + vmov.u8 out_a, #255 + vsri.u8 out_r, out_r, #5 + vsri.u8 out_g, out_g, #6 + vshrn.u16 out_b, in, #2 +.endm + +.macro convert_0565_to_x888 in, out_r, out_g, out_b + vshrn.u16 out_r, in, #8 + vshrn.u16 out_g, in, #3 + vsli.u16 in, in, #5 + vsri.u8 out_r, out_r, #5 + vsri.u8 out_g, out_g, #6 + vshrn.u16 out_b, in, #2 +.endm + +/* + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 + * pixels packed in 128-bit register (out). Requires two temporary 128-bit + * registers (tmp1, tmp2) + */ +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 + vshll.u8 tmp1, in_g, #8 + vshll.u8 out, in_r, #8 + vshll.u8 tmp2, in_b, #8 + vsri.u16 out, tmp1, #5 + vsri.u16 out, tmp2, #11 +.endm + +/* + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels + * returned in (out0, out1) registers pair. Requires one temporary + * 64-bit register (tmp). 'out1' and 'in' may overlap, the original + * value from 'in' is lost + */ +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp + vshl.u16 out0, in, #5 /* G top 6 bits */ + vshl.u16 tmp, in, #11 /* B top 5 bits */ + vsri.u16 in, in, #5 /* R is ready in top bits */ + vsri.u16 out0, out0, #6 /* G is ready in top bits */ + vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ + vshr.u16 out1, in, #8 /* R is in place */ + vsri.u16 out0, tmp, #8 /* G & B is in place */ + vzip.u16 out0, out1 /* everything is in place */ +.endm -- 2.17.1