From dcd113e5b8e1f0697d99cacc36e01315098ffcaf Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Wed, 7 Nov 2018 13:13:44 +0000
Subject: [PATCH 11/11] ARM: NEON assembly optimization for SDL_FillRect

---
 src/video/SDL_surface.c             |  21 +++
 src/video/arm/pixman-arm-neon-asm.S | 128 ++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)

diff --git a/src/video/SDL_surface.c b/src/video/SDL_surface.c
index 2a0beed17..73f05034d 100644
--- a/src/video/SDL_surface.c
+++ b/src/video/SDL_surface.c
@@ -603,6 +603,27 @@ int SDL_FillRect(SDL_Surface *dst, SDL_Rect *dstrect, Uint32 color)
     }
     row = (Uint8 *)dst->pixels+dstrect->y*dst->pitch+
             dstrect->x*dst->format->BytesPerPixel;
+#if SDL_ARM_NEON_BLITTERS
+    if (SDL_HasARMNEON() && dst->format->BytesPerPixel != 3) {
+        void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+        void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+        void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+        switch (dst->format->BytesPerPixel) {
+        case 1:
+            FillRect8ARMNEONAsm(dstrect->w, dstrect->h, (uint8_t *) row, dst->pitch >> 0, color);
+            break;
+        case 2:
+            FillRect16ARMNEONAsm(dstrect->w, dstrect->h, (uint16_t *) row, dst->pitch >> 1, color);
+            break;
+        case 4:
+            FillRect32ARMNEONAsm(dstrect->w, dstrect->h, (uint32_t *) row, dst->pitch >> 2, color);
+            break;
+        }
+
+        SDL_UnlockSurface(dst);
+        return(0);
+    }
+#endif
 #if SDL_ARM_SIMD_BLITTERS
     if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
         void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 1fcf3c117..ab9bccef9 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -95,6 +95,134 @@
 
 /******************************************************************************/
 
+/* We can actually do significantly better than the Pixman macros, at least for
+ * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
+ * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
+ */
+
+.macro generate_fillrect_function name, bpp, log2Bpp
+/*
+ * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ * On entry:
+ * a1 = width, pixels
+ * a2 = height, rows
+ * a3 = pointer to top-left destination pixel
+ * a4 = stride, pixels
+ * [sp] = pixel value to fill with
+ * Within the function:
+ * v1 = width remaining
+ * v2 = vst offset
+ * v3 = alternate pointer
+ * ip = data ARM register
+ */
+pixman_asm_function name
+    vld1.\bpp   {d0[],d1[]}, [sp]
+    sub         a4, a1
+    vld1.\bpp   {d2[],d3[]}, [sp]
+    cmp         a1, #(15+64) >> \log2Bpp
+    push        {v1-v3,lr}
+    vmov        ip, s0
+    blo         51f
+
+    /* Long-row case */
+    mov         v2, #64
+1:  mov         v1, a1
+    ands        v3, a3, #15
+    beq         2f
+    /* Leading pixels */
+    rsb         v3, v3, #16 /* number of leading bytes until 16-byte aligned */
+    sub         v1, v1, v3, lsr #\log2Bpp
+    rbit        v3, v3
+.if bpp <= 16
+.if bpp == 8
+    tst         a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
+    strneb      ip, [a3], #1
+    tst         v3, #1<<30
+.else
+    tst         a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
+.endif
+    strneh      ip, [a3], #2
+.endif
+    movs        v3, v3, lsl #3
+    vstmcs      a3!, {s0}
+    vstmmi      a3!, {d0}
+2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
+    add         v3, a3, #32
+    /* Inner loop */
+3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
+    subs        v1, v1, #64 >> \log2Bpp
+    vst1.\bpp   {q0-q1}, [v3 :128], v2
+    bhs         3b
+    /* Trailing pixels */
+4:  movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         5f
+    vst1.\bpp   {q0-q1}, [a3 :128]!
+5:  bpl         6f
+    vst1.\bpp   {q0}, [a3 :128]!
+6:  movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         1b
+    pop         {v1-v3,pc}
+
+    /* Short-row case */
+51: movs        v1, a1
+.if bpp == 8
+    tst         a3, #3
+    beq         53f
+52: subs        v1, v1, #1
+    blo         57f
+    strb        ip, [a3], #1
+    tst         a3, #3
+    bne         52b
+.elseif bpp == 16
+    tstne       a3, #2
+    subne       v1, v1, #1
+    strneh      ip, [a3], #2
+.endif
+53: cmp         v1, #32 >> \log2Bpp
+    bcc         54f
+    vst1.\bpp   {q0-q1}, [a3]!
+    sub         v1, v1, #32 >> \log2Bpp
+    /* Trailing pixels */
+54: movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         55f
+    vst1.\bpp   {q0-q1}, [a3]!
+55: bpl         56f
+    vst1.\bpp   {q0}, [a3]!
+56: movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         51b
+57: pop         {v1-v3,pc}
+
+.endfunc
+.endm
+
+generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
+generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
+generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
+
+/******************************************************************************/
+
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
     vmvn        d30, d3 /* get inverted source alpha */
     vmov        d31, d7 /* dest alpha is always unchanged */
-- 
2.17.1
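
For readers less familiar with the NEON code above, the plain-C sketch below mirrors the per-row strategy that generate_fillrect_function implements for the 32bpp case: store leading pixels one at a time until the destination is 16-byte aligned, fill the bulk of the row in 64-byte chunks (the pair of vst1.32 {q0-q1} stores in the inner loop), then mop up the trailing pixels. The function name and the scalar stores here are hypothetical stand-ins used purely for illustration; they are not part of the patch, which only adds the assembly routines and the BytesPerPixel dispatch in SDL_FillRect shown above.

#include <stdint.h>

/* Illustrative reference version of the 32bpp fill; not the patch's code. */
static void fill_rect32_reference(int32_t w, int32_t h, uint32_t *dst,
                                  int32_t dst_stride, uint32_t src)
{
    while (h-- > 0) {
        uint32_t *p = dst;
        int32_t remaining = w;

        /* Leading pixels: store singly until p reaches 16-byte alignment */
        while (remaining > 0 && ((uintptr_t)p & 15) != 0) {
            *p++ = src;
            remaining--;
        }
        /* Bulk: 16 pixels (64 bytes) per iteration, like the q0-q1 store pair */
        while (remaining >= 16) {
            for (int i = 0; i < 16; i++)
                p[i] = src;
            p += 16;
            remaining -= 16;
        }
        /* Trailing pixels */
        while (remaining-- > 0)
            *p++ = src;

        dst += dst_stride;  /* stride is in pixels, as in the asm interface */
    }
}

In the patched SDL_FillRect, the corresponding NEON routine (FillRect32ARMNEONAsm and friends) is reached through the switch on dst->format->BytesPerPixel added to SDL_surface.c above, with dst->pitch converted from bytes to pixels before the call.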