From d1bcec22750ab5b391760f2159a1fe14d4170ca2 Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Thu, 8 Nov 2018 16:18:42 +0000
Subject: [PATCH 05/11] ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits

---
 src/video/SDL_blit_N.c              | 34 +++++++++++++++++++++--
 src/video/arm/pixman-arm-simd-asm.S | 55 +++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index 2c1ca78fb..497d00715 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -32,7 +32,8 @@
 enum blit_features {
     BLIT_FEATURE_HAS_MMX = 1,
     BLIT_FEATURE_HAS_ALTIVEC = 2,
-    BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4
+    BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4,
+    BLIT_FEATURE_HAS_ARM_SIMD = 8
 };
 
 #if SDL_ALTIVEC_BLITTERS
@@ -881,7 +882,31 @@ GetBlitFeatures(void)
 #endif
 #else
 /* Feature 1 is has-MMX */
-#define GetBlitFeatures() (SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)
+#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0) | (SDL_HasARMSIMD() ? BLIT_FEATURE_HAS_ARM_SIMD : 0))
+#endif
+
+#if SDL_ARM_SIMD_BLITTERS
+/* Generated by generate_composite_function in src/video/arm/pixman-arm-simd-asm.S */
+void Blit_BGR888_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+
+/*
+ * 32bpp BGR-to-RGB blit: forwards the destination size, both pixel pointers
+ * and the per-row strides to the ARM SIMD assembly routine. Each stride is
+ * the row width plus the skip value divided by 4 (the skip is presumably a
+ * byte count, so the stride ends up in 32-bit pixels - confirm with caller).
+ */
+static void
+Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo * info)
+{
+    int32_t width = info->dst_w;
+    int32_t height = info->dst_h;
+    uint32_t *dstp = (uint32_t *)info->dst;
+    int32_t dststride = width + (info->dst_skip >> 2);
+    uint32_t *srcp = (uint32_t *)info->src;
+    int32_t srcstride = width + (info->src_skip >> 2);
+
+    Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
 #endif
 
 /* This is now endian dependent */
@@ -2576,6 +2601,11 @@ static const struct blit_table normal_blit_4[] = {
     /* has-altivec */
     {0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0, 0x0000001F,
      BLIT_FEATURE_HAS_ALTIVEC, Blit_RGB888_RGB565Altivec, NO_ALPHA},
+#endif
+#if SDL_ARM_SIMD_BLITTERS
+    /* has-ARM-SIMD */
+    {0x000000FF, 0x0000FF00, 0x00FF0000, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
+     BLIT_FEATURE_HAS_ARM_SIMD, Blit_BGR888_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
 #endif
     {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0, 0x0000001F,
      0, Blit_RGB888_RGB565, NO_ALPHA},
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index c749dcd00..cd649ff81 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -369,3 +369,58 @@ generate_composite_function \
     nop_macro, /* cleanup */ \
     ARGBto565PixelAlpha_process_head, \
     ARGBto565PixelAlpha_process_tail
+
+ /******************************************************************************/
+
+/*
+ * Convert one pixel held in a WK register from 0xAABBGGRR to 0xAARRGGBB:
+ *   tmp = 0x00AA00GG  (uxtb16 of the word rotated right by 8)
+ *   WK  = 0x00RR00BB  (uxtb16 of the word rotated right by 16)
+ *   WK  = WK | (tmp << 8), so bytes 1 and 3 keep their positions
+ */
+.macro BGR888toRGB888_1pixel cond, reg, tmp
+        uxtb16&cond  tmp, WK&reg, ror #8
+        uxtb16&cond  WK&reg, WK&reg, ror #16
+        orr&cond     WK&reg, WK&reg, tmp, lsl #8
+.endm
+
+/* Same conversion applied to two WK registers, each with its own temporary. */
+.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
+        uxtb16&cond  tmp1, WK&reg1, ror #8
+        uxtb16&cond  WK&reg1, WK&reg1, ror #16
+        uxtb16&cond  tmp2, WK&reg2, ror #8
+        uxtb16&cond  WK&reg2, WK&reg2, ror #16
+        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
+        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
+.endm
+
+/* Head: issues the pixld of numbytes from SRC, starting at firstreg. */
+.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+/*
+ * Tail: convert the registers in place. With 8 bytes or more one pair is
+ * converted (a second pair when numbytes is 16); otherwise a single pixel.
+ * MASK and STRIDE_M are passed as the temporaries.
+ */
+.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
+ .if numbytes >= 8
+        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
+  .if numbytes == 16
+        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
+  .endif
+ .else @ numbytes == 4
+        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
+ .endif
+.endm
+
+generate_composite_function \
+    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    BGR888toRGB888_process_head, \
+    BGR888toRGB888_process_tail
-- 
2.17.1