From 6f9d9416f8c574db2df8c2ffe3ddd88684985f65 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Fri, 12 Aug 2016 18:20:16 +0100 Subject: [PATCH 02/11] ARM: SIMD assembly optimization for function BlitRGBtoRGBPixelAlpha Much of the heavy lifting of this optimization is lifted from the Pixman project, which is distributed under an MIT-style license. --- src/video/SDL_blit_A.c | 20 + src/video/arm/pixman-arm-asm.h | 37 ++ src/video/arm/pixman-arm-simd-asm.S | 174 ++++++ src/video/arm/pixman-arm-simd-asm.h | 1040 +++++++++++++++++++++++++++++++++++ 4 files changed, 1271 insertions(+) create mode 100644 src/video/arm/pixman-arm-asm.h create mode 100644 src/video/arm/pixman-arm-simd-asm.S create mode 100644 src/video/arm/pixman-arm-simd-asm.h diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 219cdccf5..b4fbe4abf 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -1433,6 +1433,22 @@ static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info) #endif #endif /* SDL_ALTIVEC_BLITTERS */ +#if SDL_ARM_SIMD_BLITTERS +void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); + +static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info) +{ + int32_t width = info->d_width; + int32_t height = info->d_height; + uint32_t *dstp = (uint32_t *)info->d_pixels; + int32_t dststride = width + (info->d_skip >> 2); + uint32_t *srcp = (uint32_t *)info->s_pixels; + int32_t srcstride = width + (info->s_skip >> 2); + + BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); +} +#endif + /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) { @@ -2853,6 +2869,10 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) && SDL_HasAltiVec()) return BlitRGBtoRGBPixelAlphaAltivec; #endif +#if SDL_ARM_SIMD_BLITTERS + if (SDL_HasARMSIMD()) + return BlitRGBtoRGBPixelAlphaARMSIMD; +#endif return BlitRGBtoRGBPixelAlpha; } } diff --git a/src/video/arm/pixman-arm-asm.h b/src/video/arm/pixman-arm-asm.h new file mode 100644 index 000000000..ee7854108 --- /dev/null +++ b/src/video/arm/pixman-arm-asm.h @@ -0,0 +1,37 @@ +/* + * Copyright © 2008 Mozilla Corporation + * Copyright © 2010 Nokia Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Mozilla Corporation makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname + .func fname + .global fname +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: +.endm diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S new file mode 100644 index 000000000..916e63a58 --- /dev/null +++ b/src/video/arm/pixman-arm-simd-asm.S @@ -0,0 +1,174 @@ +/* + * Copyright © 2016 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ben Avison (bavison@riscosopen.org) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + .p2align 2 + +#include "pixman-arm-asm.h" +#include "pixman-arm-simd-asm.h" + +/* A head macro should do all processing which results in an output of up to + * 16 bytes, as far as the final load instruction. The corresponding tail macro + * should complete the processing of the up-to-16 bytes. The calling macro will + * sometimes choose to insert a preload or a decrement of X between them. + * cond ARM condition code for code block + * numbytes Number of output bytes that should be generated this time + * firstreg First WK register in which to place output + * unaligned_src Whether to use non-wordaligned loads of source image + * unaligned_mask Whether to use non-wordaligned loads of mask image + * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output + */ + +/******************************************************************************/ + +/* This differs from the over_8888_8888 routine in Pixman in that the destination + * alpha component is always left unchanged, and RGB components are not + * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that + * renormalisation is done by multiplying by 257/256 (with rounding) rather than + * simply shifting right by 8 bits - removing the need to special-case alpha=0xff. + */ + +.macro RGBtoRGBPixelAlpha_init + line_saved_regs STRIDE_S, ORIG_W + mov MASK, #0x80 +.endm + +.macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half + uxtb tmp3, s + uxtb tmp0, d + sub tmp0, tmp3, tmp0 + uxtb tmp3, s, ror #16 + uxtb tmp1, d, ror #16 + sub tmp1, tmp3, tmp1 + uxtb tmp3, s, ror #8 + mov s, s, lsr #24 + uxtb tmp2, d, ror #8 + sub tmp2, tmp3, tmp2 + smlabb tmp0, tmp0, s, half + smlabb tmp1, tmp1, s, half + smlabb tmp2, tmp2, s, half + add tmp0, tmp0, asr #8 + add tmp1, tmp1, asr #8 + add tmp2, tmp2, asr #8 + pkhbt tmp0, tmp0, tmp1, lsl #16 + and tmp2, tmp2, #0xff00 + uxtb16 tmp0, tmp0, ror #8 + orr tmp0, tmp0, tmp2 + uadd8 d, d, tmp0 +.endm + +.macro RGBtoRGBPixelAlpha_1pixel_opaque s, d + and d, d, #0xff000000 + bic s, s, #0xff000000 + orr d, d, s +.endm + +.macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload + .if numbytes == 16 + ldm SRC!, {WK0, WK1} + ldm SRC!, {STRIDE_S, STRIDE_M} + ldrd WK2, WK3, [DST], #16 + orr SCRATCH, WK0, WK1 + and ORIG_W, WK0, WK1 + orr SCRATCH, SCRATCH, STRIDE_S + and ORIG_W, ORIG_W, STRIDE_S + orr SCRATCH, SCRATCH, STRIDE_M + and ORIG_W, ORIG_W, STRIDE_M + tst SCRATCH, #0xff000000 + .elseif numbytes == 8 + ldm SRC!, {WK0, WK1} + ldm DST!, {WK2, WK3} + orr SCRATCH, WK0, WK1 + and ORIG_W, WK0, WK1 + tst SCRATCH, #0xff000000 + .else // numbytes == 4 + ldr WK0, [SRC], #4 + ldr WK2, [DST], #4 + tst WK0, #0xff000000 + .endif +.endm + +.macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg + beq 20f @ all transparent + .if numbytes == 16 + cmp ORIG_W, #0xff000000 + bhs 10f @ all opaque + RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + strd WK2, WK3, [DST, #-16] + ldrd WK0, WK1, [SRC, #-8] + ldrd WK2, WK3, [DST, #-8] + RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + b 19f +10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 + RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 + strd WK2, WK3, [DST, #-16] + ldrd WK0, WK1, [SRC, #-8] + ldrd WK2, WK3, [DST, #-8] + RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 + RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 +19: strd WK2, WK3, [DST, #-8] + .elseif numbytes == 8 + cmp ORIG_W, #0xff000000 + bhs 10f @ all opaque + RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + b 19f +10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 + RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 +19: strd WK2, WK3, [DST, #-8] + .else // numbytes == 4 + cmp WK0, #0xff000000 + bhs 10f @ opaque + RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK + b 19f +10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 +19: str WK2, [DST, #-4] + .endif +20: +.endm + +generate_composite_function \ + BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \ + FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ + 2, /* prefetch distance */ \ + RGBtoRGBPixelAlpha_init, \ + nop_macro, /* newline */ \ + nop_macro, /* cleanup */ \ + RGBtoRGBPixelAlpha_process_head, \ + RGBtoRGBPixelAlpha_process_tail + +/******************************************************************************/ diff --git a/src/video/arm/pixman-arm-simd-asm.h b/src/video/arm/pixman-arm-simd-asm.h new file mode 100644 index 000000000..8e6598100 --- /dev/null +++ b/src/video/arm/pixman-arm-simd-asm.h @@ -0,0 +1,1040 @@ +/* + * Copyright © 2012 Raspberry Pi Foundation + * Copyright © 2012 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Ben Avison (bavison@riscosopen.org) + * + */ + +/* + * Because the alignment of pixel data to cachelines, and even the number of + * cachelines per row can vary from row to row, and because of the need to + * preload each scanline once and only once, this prefetch strategy treats + * each row of pixels independently. When a pixel row is long enough, there + * are three distinct phases of prefetch: + * * an inner loop section, where each time a cacheline of data is + * processed, another cacheline is preloaded (the exact distance ahead is + * determined empirically using profiling results from lowlevel-blt-bench) + * * a leading section, where enough cachelines are preloaded to ensure no + * cachelines escape being preloaded when the inner loop starts + * * a trailing section, where a limited number (0 or more) of cachelines + * are preloaded to deal with data (if any) that hangs off the end of the + * last iteration of the inner loop, plus any trailing bytes that were not + * enough to make up one whole iteration of the inner loop + * + * There are (in general) three distinct code paths, selected between + * depending upon how long the pixel row is. If it is long enough that there + * is at least one iteration of the inner loop (as described above) then + * this is described as the "wide" case. If it is shorter than that, but + * there are still enough bytes output that there is at least one 16-byte- + * long, 16-byte-aligned write to the destination (the optimum type of + * write), then this is the "medium" case. If it is not even this long, then + * this is the "narrow" case, and there is no attempt to align writes to + * 16-byte boundaries. In the "medium" and "narrow" cases, all the + * cachelines containing data from the pixel row are prefetched up-front. + */ + +/* + * Determine whether we put the arguments on the stack for debugging. + */ +#undef DEBUG_PARAMS + +/* + * Bit flags for 'generate_composite_function' macro which are used + * to tune generated functions behavior. + */ +.set FLAG_DST_WRITEONLY, 0 +.set FLAG_DST_READWRITE, 1 +.set FLAG_COND_EXEC, 0 +.set FLAG_BRANCH_OVER, 2 +.set FLAG_PROCESS_PRESERVES_PSR, 0 +.set FLAG_PROCESS_CORRUPTS_PSR, 4 +.set FLAG_PROCESS_DOESNT_STORE, 0 +.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ +.set FLAG_NO_SPILL_LINE_VARS, 0 +.set FLAG_SPILL_LINE_VARS_WIDE, 16 +.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 +.set FLAG_SPILL_LINE_VARS, 48 +.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 +.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 +.set FLAG_PROCESS_PRESERVES_WK0, 0 +.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ +.set FLAG_PRELOAD_DST, 0 +.set FLAG_NO_PRELOAD_DST, 256 + +/* + * Number of bytes by which to adjust preload offset of destination + * buffer (allows preload instruction to be moved before the load(s)) + */ +.set DST_PRELOAD_BIAS, 0 + +/* + * Offset into stack where mask and source pointer/stride can be accessed. + */ +#ifdef DEBUG_PARAMS +.set ARGS_STACK_OFFSET, (9*4+9*4) +#else +.set ARGS_STACK_OFFSET, (9*4) +#endif + +/* + * Offset into stack where space allocated during init macro can be accessed. + */ +.set LOCALS_STACK_OFFSET, 0 + +/* + * Constants for selecting preferable prefetch type. + */ +.set PREFETCH_TYPE_NONE, 0 +.set PREFETCH_TYPE_STANDARD, 1 + +/* + * Definitions of macros for load/store of pixel data. + */ + +.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 + .if numbytes == 16 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + op&r&cond WK®2, [base], #4 + op&r&cond WK®3, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} + .endif + .elseif numbytes == 8 + .if unaligned == 1 + op&r&cond WK®0, [base], #4 + op&r&cond WK®1, [base], #4 + .else + op&m&cond&ia base!, {WK®0,WK®1} + .endif + .elseif numbytes == 4 + op&r&cond WK®0, [base], #4 + .elseif numbytes == 2 + op&r&cond&h WK®0, [base], #2 + .elseif numbytes == 1 + op&r&cond&b WK®0, [base], #1 + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base + .if numbytes == 16 + stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} + .elseif numbytes == 8 + stm&cond&db base, {WK®0,WK®1} + .elseif numbytes == 4 + str&cond WK®0, [base, #-4] + .elseif numbytes == 2 + str&cond&h WK®0, [base, #-2] + .elseif numbytes == 1 + str&cond&b WK®0, [base, #-1] + .else + .error "unsupported size: numbytes" + .endif +.endm + +.macro pixld cond, numbytes, firstreg, base, unaligned + pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned +.endm + +.macro pixst cond, numbytes, firstreg, base + .if (flags) & FLAG_DST_READWRITE + pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .else + pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base + .endif +.endm + +.macro PF a, x:vararg + .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) + a x + .endif +.endm + + +.macro preload_leading_step1 bpp, ptr, base +/* If the destination is already 16-byte aligned, then we need to preload + * between 0 and prefetch_distance (inclusive) cache lines ahead so there + * are no gaps when the inner loop starts. + */ + .if bpp > 0 + PF bic, ptr, base, #31 + .set OFFSET, 0 + .rept prefetch_distance+1 + PF pld, [ptr, #OFFSET] + .set OFFSET, OFFSET+32 + .endr + .endif +.endm + +.macro preload_leading_step2 bpp, bpp_shift, ptr, base +/* However, if the destination is not 16-byte aligned, we may need to + * preload more cache lines than that. The question we need to ask is: + * are the bytes corresponding to the leading pixels more than the amount + * by which the source pointer will be rounded down for preloading, and if + * so, by how many cache lines? Effectively, we want to calculate + * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp + * inner_loop_offset = (src+leading_bytes)&31 + * extra_needed = leading_bytes - inner_loop_offset + * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only + * possible when there are 4 src bytes for every 1 dst byte). + */ + .if bpp > 0 + .ifc base,DST + /* The test can be simplified further when preloading the destination */ + PF tst, base, #16 + PF beq, 61f + .else + .if bpp/dst_w_bpp == 4 + PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift + PF and, SCRATCH, SCRATCH, #31 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift + PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ + PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ + PF bcs, 61f + PF bpl, 60f + PF pld, [ptr, #32*(prefetch_distance+2)] + .else + PF mov, SCRATCH, base, lsl #32-5 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift + PF bls, 61f + .endif + .endif +60: PF pld, [ptr, #32*(prefetch_distance+1)] +61: + .endif +.endm + +#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) +.macro preload_middle bpp, base, scratch_holds_offset + .if bpp > 0 + /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) + .if scratch_holds_offset + PF pld, [base, SCRATCH] + .else + PF bic, SCRATCH, base, #31 + PF pld, [SCRATCH, #32*prefetch_distance] + .endif + .endif + .endif +.endm + +.macro preload_trailing bpp, bpp_shift, base + .if bpp > 0 + .if bpp*pix_per_block > 256 + /* Calculations are more complex if more than one fetch per block */ + PF and, WK1, base, #31 + PF add, WK1, WK1, WK0, lsl #bpp_shift + PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) + PF bic, SCRATCH, base, #31 +80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] + PF add, SCRATCH, SCRATCH, #32 + PF subs, WK1, WK1, #32 + PF bhi, 80b + .else + /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ + PF mov, SCRATCH, base, lsl #32-5 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift + PF adceqs, SCRATCH, SCRATCH, #0 + /* The instruction above has two effects: ensures Z is only + * set if C was clear (so Z indicates that both shifted quantities + * were 0), and clears C if Z was set (so C indicates that the sum + * of the shifted quantities was greater and not equal to 32) */ + PF beq, 82f + PF bic, SCRATCH, base, #31 + PF bcc, 81f + PF pld, [SCRATCH, #32*(prefetch_distance+2)] +81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] +82: + .endif + .endif +.endm + + +.macro preload_line narrow_case, bpp, bpp_shift, base +/* "narrow_case" - just means that the macro was invoked from the "narrow" + * code path rather than the "medium" one - because in the narrow case, + * the row of pixels is known to output no more than 30 bytes, then + * (assuming the source pixels are no wider than the the destination + * pixels) they cannot possibly straddle more than 2 32-byte cachelines, + * meaning there's no need for a loop. + * "bpp" - number of bits per pixel in the channel (source, mask or + * destination) that's being preloaded, or 0 if this channel is not used + * for reading + * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) + * "base" - base address register of channel to preload (SRC, MASK or DST) + */ + .if bpp > 0 + .if narrow_case && (bpp <= dst_w_bpp) + /* In these cases, each line for each channel is in either 1 or 2 cache lines */ + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, LSL #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 90f + PF pld, [WK1] +90: + .else + PF bic, WK0, base, #31 + PF pld, [WK0] + PF add, WK1, base, X, lsl #bpp_shift + PF sub, WK1, WK1, #1 + PF bic, WK1, WK1, #31 + PF cmp, WK1, WK0 + PF beq, 92f +91: PF add, WK0, WK0, #32 + PF cmp, WK0, WK1 + PF pld, [WK0] + PF bne, 91b +92: + .endif + .endif +.endm + + +.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond X, X, #8*numbytes/dst_w_bpp + .endif + process_tail cond, numbytes, firstreg + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst cond, numbytes, firstreg, DST + .endif +.endm + +.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_BRANCH_OVER + .ifc cond,mi + bpl 100f + .endif + .ifc cond,cs + bcc 100f + .endif + .ifc cond,ne + beq 100f + .endif + conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx +100: + .else + conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx + .endif +.endm + +.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx + .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) + /* Can't interleave reads and writes */ + test + conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx + .if (flags) & FLAG_PROCESS_CORRUPTS_PSR + test + .endif + conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx + .else + /* Can interleave reads and writes for better scheduling */ + test + process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 + process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 + .if decrementx + sub&cond1 X, X, #8*numbytes1/dst_w_bpp + sub&cond2 X, X, #8*numbytes2/dst_w_bpp + .endif + process_tail cond1, numbytes1, firstreg1 + process_tail cond2, numbytes2, firstreg2 + pixst cond1, numbytes1, firstreg1, DST + pixst cond2, numbytes2, firstreg2, DST + .endif +.endm + + +.macro test_bits_1_0_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ + .else + movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ + .endif +.endm + +.macro test_bits_3_2_ptr + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ + .else + movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ + .endif +.endm + +.macro leading_15bytes process_head, process_tail + /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */ + .set DECREMENT_X, 1 + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set DECREMENT_X, 0 + sub X, X, WK0, lsr #dst_bpp_shift + str X, [sp, #LINE_SAVED_REG_COUNT*4] + mov X, WK0 + .endif + /* Use unaligned loads in all cases for simplicity */ + .if dst_w_bpp == 8 + conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X + .elseif dst_w_bpp == 16 + test_bits_1_0_ptr + conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X + .endif + conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + ldr X, [sp, #LINE_SAVED_REG_COUNT*4] + .endif +.endm + +.macro test_bits_3_2_pix + movs SCRATCH, X, lsl #dst_bpp_shift+32-3 +.endm + +.macro test_bits_1_0_pix + .if dst_w_bpp == 8 + movs SCRATCH, X, lsl #dst_bpp_shift+32-1 + .else + movs SCRATCH, X, lsr #1 + .endif +.endm + +.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask + conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 + .if dst_w_bpp == 16 + test_bits_1_0_pix + conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 + .elseif dst_w_bpp == 8 + conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 + .endif +.endm + + +.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment +110: + .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ + .rept pix_per_block*dst_w_bpp/128 + process_head , 16, 0, unaligned_src, unaligned_mask, 1 + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle src_bpp, SRC, 1 + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + preload_middle mask_bpp, MASK, 1 + .else + preload_middle src_bpp, SRC, 0 + preload_middle mask_bpp, MASK, 0 + .endif + .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) + /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that + * destination prefetches are 32-byte aligned. It's also the easiest channel to offset + * preloads for, to achieve staggered prefetches for multiple channels, because there are + * always two STMs per prefetch, so there is always an opposite STM on which to put the + * preload. Note, no need to BIC the base register here */ + PF pld, [DST, #32*prefetch_distance - dst_alignment] + .endif + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + .set SUBBLOCK, SUBBLOCK+1 + .endr + subs X, X, #pix_per_block + bhs 110b +.endm + +.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask + /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ + .if dst_r_bpp > 0 + tst DST, #16 + bne 111f + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS + b 112f +111: + .endif + process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS +112: + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ + .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) + PF and, WK0, X, #pix_per_block-1 + .endif + preload_trailing src_bpp, src_bpp_shift, SRC + preload_trailing mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_trailing dst_r_bpp, dst_bpp_shift, DST + .endif + add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp + /* The remainder of the line is handled identically to the medium case */ + medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask +.endm + +.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask +120: + process_head , 16, 0, unaligned_src, unaligned_mask, 0 + process_tail , 16, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 16, 0, DST + .endif + subs X, X, #128/dst_w_bpp + bhs 120b + /* Trailing pixels */ + tst X, #128/dst_w_bpp - 1 + beq exit_label + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask + tst X, #16*8/dst_w_bpp + conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 + /* Trailing pixels */ + /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ + trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask +.endm + +.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label + /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ + .if mask_bpp == 8 || mask_bpp == 16 + tst MASK, #3 + bne 141f + .endif + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 140f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 0 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +140: + action process_head, process_tail, process_inner_loop, exit_label, 1, 0 + .endif + .if mask_bpp == 8 || mask_bpp == 16 + b exit_label +141: + .if src_bpp == 8 || src_bpp == 16 + tst SRC, #3 + bne 142f + .endif + action process_head, process_tail, process_inner_loop, exit_label, 0, 1 + .if src_bpp == 8 || src_bpp == 16 + b exit_label +142: + action process_head, process_tail, process_inner_loop, exit_label, 1, 1 + .endif + .endif +.endm + + +.macro end_of_line restore_x, vars_spilled, loop_label, last_one + .if SINGLE_SCANLINE + .ifc "last_one","" + b 198f + .endif + .else + .if vars_spilled + /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */ + /* This is ldmia sp,{} */ + .word 0xE89D0000 | LINE_SAVED_REGS + .endif + subs Y, Y, #1 + .if vars_spilled + .if (LINE_SAVED_REGS) & (1<<1) + str Y, [sp] + .endif + .endif + add DST, DST, STRIDE_D + .if src_bpp > 0 + add SRC, SRC, STRIDE_S + .endif + .if mask_bpp > 0 + add MASK, MASK, STRIDE_M + .endif + .if restore_x + mov X, ORIG_W + .endif + bhs loop_label + .ifc "last_one","" + .if vars_spilled + b 197f + .else + b 198f + .endif + .else + .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) + b 198f + .endif + .endif + .endif +.endm + + +.macro generate_composite_function_common fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags_, \ + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + + pixman_asm_function fname + +/* + * Make some macro arguments globally visible and accessible + * from other macros + */ + .set src_bpp, src_bpp_ + .set mask_bpp, mask_bpp_ + .set dst_w_bpp, dst_w_bpp_ + .set flags, flags_ + .set prefetch_distance, prefetch_distance_ + +/* + * Select prefetch type for this function. + */ + .if prefetch_distance == 0 + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE + .else + .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD + .endif + + .if src_bpp == 32 + .set src_bpp_shift, 2 + .elseif src_bpp == 24 + .set src_bpp_shift, 0 + .elseif src_bpp == 16 + .set src_bpp_shift, 1 + .elseif src_bpp == 8 + .set src_bpp_shift, 0 + .elseif src_bpp == 0 + .set src_bpp_shift, -1 + .else + .error "requested src bpp (src_bpp) is not supported" + .endif + + .if mask_bpp == 32 + .set mask_bpp_shift, 2 + .elseif mask_bpp == 24 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 8 + .set mask_bpp_shift, 0 + .elseif mask_bpp == 0 + .set mask_bpp_shift, -1 + .else + .error "requested mask bpp (mask_bpp) is not supported" + .endif + + .if dst_w_bpp == 32 + .set dst_bpp_shift, 2 + .elseif dst_w_bpp == 24 + .set dst_bpp_shift, 0 + .elseif dst_w_bpp == 16 + .set dst_bpp_shift, 1 + .elseif dst_w_bpp == 8 + .set dst_bpp_shift, 0 + .else + .error "requested dst bpp (dst_w_bpp) is not supported" + .endif + + .if (((flags) & FLAG_DST_READWRITE) != 0) + .set dst_r_bpp, dst_w_bpp + .else + .set dst_r_bpp, 0 + .endif + + .set pix_per_block, 16*8/dst_w_bpp + .if src_bpp != 0 + .if 32*8/src_bpp > pix_per_block + .set pix_per_block, 32*8/src_bpp + .endif + .endif + .if mask_bpp != 0 + .if 32*8/mask_bpp > pix_per_block + .set pix_per_block, 32*8/mask_bpp + .endif + .endif + .if dst_r_bpp != 0 + .if 32*8/dst_r_bpp > pix_per_block + .set pix_per_block, 32*8/dst_r_bpp + .endif + .endif + +/* The standard entry conditions set up by pixman-arm-common.h are: + * r0 = width (pixels) + * r1 = height (rows) + * r2 = pointer to top-left pixel of destination + * r3 = destination stride (pixels) + * [sp] = source pixel value, or pointer to top-left pixel of source + * [sp,#4] = 0 or source stride (pixels) + * The following arguments are unused for non-mask operations + * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask + * [sp,#12] = 0 or mask stride (pixels) + * + * or in the single-scanline case: + * r0 = width (pixels) + * r1 = pointer to top-left pixel of destination + * r2 = pointer to top-left pixel of source + * The following argument is unused for non-mask operations + * r3 = pointer to top-left pixel of mask + */ + +/* + * Assign symbolic names to registers + */ + X .req r0 /* pixels to go on this line */ + .if SINGLE_SCANLINE + DST .req r1 /* destination pixel pointer */ + SRC .req r2 /* source pixel pointer */ + MASK .req r3 /* mask pixel pointer (if applicable) */ + Y .req r4 /* temporary */ + STRIDE_D .req r5 /* temporary */ + STRIDE_S .req r6 /* temporary */ + STRIDE_M .req r7 /* temporary */ + .else + Y .req r1 /* lines to go */ + DST .req r2 /* destination pixel pointer */ + STRIDE_D .req r3 /* destination stride (bytes, minus width) */ + SRC .req r4 /* source pixel pointer */ + STRIDE_S .req r5 /* source stride (bytes, minus width) */ + MASK .req r6 /* mask pixel pointer (if applicable) */ + STRIDE_M .req r7 /* mask stride (bytes, minus width) */ + .endif + WK0 .req r8 /* pixel data registers */ + WK1 .req r9 + WK2 .req r10 + WK3 .req r11 + SCRATCH .req r12 + ORIG_W .req r14 /* width (pixels) */ + + push {r4-r11, lr} /* save all registers */ + + .if !SINGLE_SCANLINE + subs Y, Y, #1 + blo 199f + .endif + +#ifdef DEBUG_PARAMS + sub sp, sp, #9*4 +#endif + + .if !SINGLE_SCANLINE + .if src_bpp > 0 + ldr SRC, [sp, #ARGS_STACK_OFFSET] + ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] + .endif + .if mask_bpp > 0 + ldr MASK, [sp, #ARGS_STACK_OFFSET+8] + ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] + .endif + .endif + +#ifdef DEBUG_PARAMS + add Y, Y, #1 + stmia sp, {r0-r7,pc} + sub Y, Y, #1 +#endif + + init + + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + /* Reserve a word in which to store X during leading pixels */ + sub sp, sp, #4 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 + .endif + + .if !SINGLE_SCANLINE + lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ + sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift + .if src_bpp > 0 + lsl STRIDE_S, #src_bpp_shift + sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift + .endif + .if mask_bpp > 0 + lsl STRIDE_M, #mask_bpp_shift + sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift + .endif + .endif + + /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ + cmp X, #2*16*8/dst_w_bpp - 1 + blo 170f + .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ + /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ + cmp X, #(prefetch_distance+3)*pix_per_block - 1 + blo 160f + + /* Wide case */ + /* Adjust X so that the decrement instruction can also test for + * inner loop termination. We want it to stop when there are + * (prefetch_distance+1) complete blocks to go. */ + sub X, X, #(prefetch_distance+2)*pix_per_block + .if !SINGLE_SCANLINE + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .endif + .endif +151: /* New line */ + newline + preload_leading_step1 src_bpp, WK1, SRC + preload_leading_step1 mask_bpp, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_leading_step1 dst_r_bpp, WK3, DST + .endif + + ands WK0, DST, #15 + beq 154f + rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ + + preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC + preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST + .endif + + leading_15bytes process_head, process_tail + +154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ + .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, SRC, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) + and SCRATCH, MASK, #31 + rsb SCRATCH, SCRATCH, #32*prefetch_distance + .endif + .ifc "process_inner_loop","" + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f + .else + switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f + .endif + +157: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b + .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE) + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif + .endif + + .ltorg + +160: /* Medium case */ + .if !SINGLE_SCANLINE + mov ORIG_W, X + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 + .endif + .endif +161: /* New line */ + newline + preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 0, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_line 0, dst_r_bpp, dst_bpp_shift, DST + .endif + + sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ + ands WK0, DST, #15 + beq 164f + rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ + + leading_15bytes process_head, process_tail + +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ + switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f + +167: /* Check for another line */ + end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b + + .ltorg + +170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ + .if !SINGLE_SCANLINE + .if dst_w_bpp < 32 + mov ORIG_W, X + .endif + .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE + /* This is stmdb sp!,{} */ + .word 0xE92D0000 | LINE_SAVED_REGS + .endif + .endif +171: /* New line */ + newline + preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ + preload_line 1, mask_bpp, mask_bpp_shift, MASK + .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 + preload_line 1, dst_r_bpp, dst_bpp_shift, DST + .endif + + .if dst_w_bpp == 8 + tst DST, #3 + beq 174f +172: subs X, X, #1 + blo 177f + process_head , 1, 0, 1, 1, 0 + process_tail , 1, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 1, 0, DST + .endif + tst DST, #3 + bne 172b + .elseif dst_w_bpp == 16 + tst DST, #2 + beq 174f + subs X, X, #1 + blo 177f + process_head , 2, 0, 1, 1, 0 + process_tail , 2, 0 + .if !((flags) & FLAG_PROCESS_DOES_STORE) + pixst , 2, 0, DST + .endif + .endif + +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ + switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f + +177: /* Check for another line */ + end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one + .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE) + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 + .endif + +197: + .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS) + add sp, sp, #LINE_SAVED_REG_COUNT*4 + .endif +198: + .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 + .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 + .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 + add sp, sp, #4 + .endif + + cleanup + +#ifdef DEBUG_PARAMS + add sp, sp, #9*4 /* junk the debug copy of arguments */ +#endif +199: + pop {r4-r11, pc} /* exit */ + + .ltorg + + .unreq X + .unreq Y + .unreq DST + .unreq STRIDE_D + .unreq SRC + .unreq STRIDE_S + .unreq MASK + .unreq STRIDE_M + .unreq WK0 + .unreq WK1 + .unreq WK2 + .unreq WK3 + .unreq SCRATCH + .unreq ORIG_W + .endfunc +.endm + +.macro generate_composite_function fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags_, \ + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + .set SINGLE_SCANLINE, 0 +generate_composite_function_common \ + fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ + init, newline, cleanup, process_head, process_tail, process_inner_loop +.endm + +.macro generate_composite_function_single_scanline fname, \ + src_bpp_, \ + mask_bpp_, \ + dst_w_bpp_, \ + flags_, \ + prefetch_distance_, \ + init, \ + newline, \ + cleanup, \ + process_head, \ + process_tail, \ + process_inner_loop + .set SINGLE_SCANLINE, 1 +generate_composite_function_common \ + fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ + init, newline, cleanup, process_head, process_tail, process_inner_loop +.endm + +.macro line_saved_regs x:vararg + .set LINE_SAVED_REGS, 0 + .set LINE_SAVED_REG_COUNT, 0 + .irp SAVED_REG,x + .ifc "SAVED_REG","Y" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_D" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_S" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","STRIDE_M" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .ifc "SAVED_REG","ORIG_W" + .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) + .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 + .endif + .endr + .if SINGLE_SCANLINE + .set LINE_SAVED_REG_COUNT, 0 + .endif +.endm + +.macro nop_macro x:vararg +.endm -- 2.11.0