From 7e14e09c1901e25c0587c3b52b45fc7ee0719649 Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Tue, 6 Sep 2016 13:16:23 +0100
Subject: [PATCH 07/11] ARM: SIMD optimization for 4:4:4:4 to 8:8:8:8 normal blits

---
 src/video/SDL_blit_N.c              | 32 ++++++++++++
 src/video/arm/pixman-arm-simd-asm.S | 81 +++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index 308f0ddef..6e9c490cd 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -871,6 +871,34 @@ static void Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo *info)
 
     Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
 }
+
+/*
+ * Assembly routine (generated in arm/pixman-arm-simd-asm.S) that performs the
+ * conversion.  The wrapper below passes both strides in pixels, not bytes.
+ */
+void Blit_RGB444_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint16_t *src, int32_t src_stride);
+
+/*
+ * Blit 16-bit 4:4:4:4 pixels to 32-bit 8:8:8:8 pixels using the ARM SIMD
+ * assembly routine.
+ *
+ * The skip fields are shifted down by the pixel size (>> 2 for the 32-bit
+ * destination, >> 1 for the 16-bit source) and added to the width to form
+ * per-row strides in pixels.
+ * NOTE(review): this assumes d_skip/s_skip are byte counts that are whole
+ * multiples of the pixel size - confirm against SDL_BlitInfo.
+ */
+static void Blit_RGB444_RGB888ARMSIMD(SDL_BlitInfo *info)
+{
+    int32_t width = info->d_width;
+    int32_t height = info->d_height;
+    uint32_t *dstp = (uint32_t *)info->d_pixels;
+    int32_t dststride = width + (info->d_skip >> 2);
+    uint16_t *srcp = (uint16_t *)info->s_pixels;
+    int32_t srcstride = width + (info->s_skip >> 1);
+
+    Blit_RGB444_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
 #endif
 
 /* This is now endian dependent */
@@ -2353,6 +2381,10 @@ static const struct blit_table normal_blit_2[] = {
       BLIT_FEATURE_HAS_ALTIVEC, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
     { 0x00007C00,0x000003E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
       BLIT_FEATURE_HAS_ALTIVEC, NULL, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
+#endif
+#if SDL_ARM_SIMD_BLITTERS
+    { 0x00000F00,0x000000F0,0x0000000F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
+      BLIT_FEATURE_HAS_ARM_SIMD, NULL, Blit_RGB444_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
 #endif
     { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
       0, NULL, Blit_RGB565_ARGB8888, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 769c213f6..d9c29993d 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -473,3 +473,84 @@ generate_composite_function \
     nop_macro, /* cleanup */ \
     BGR888toRGB888_process_head, \
     BGR888toRGB888_process_tail
+
+/******************************************************************************/
+
+/*
+ * 4:4:4:4 to 8:8:8:8 conversion: every 4-bit field is widened to 8 bits by
+ * repeating it in both nibbles of the output byte.
+ */
+
+/* Load the nibble mask and program the GE flags that the SEL instructions use */
+.macro RGB444toRGB888_init
+        ldr     MASK, =0x0f0f0f0f
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        msr     CPSR_s, #0x50000
+.endm
+
+/*
+ * Convert the single pixel held in the low halfword of WK<reg>, in place.
+ * tmp is overwritten.
+ */
+.macro RGB444toRGB888_1pixel reg, mask, tmp
+        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12         @ 0000aaaarrrrggggaaaarrrrggggbbbb
+        and     WK&reg, mask, WK&reg                    @ 0000aaaa0000gggg0000rrrr0000bbbb
+        orr     WK&reg, WK&reg, WK&reg, lsl #4          @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
+        pkhtb   tmp, WK&reg, WK&reg, asr #8             @ aaaaaaaaggggggggggggggggrrrrrrrr
+        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8          @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
+        sel     WK&reg, WK&reg, tmp                     @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
+.endm
+
+/*
+ * Convert the two pixels packed in WK<in> (lower-case letters in the bit
+ * diagrams are the low halfword) into WK<out1> and WK<out2>.  WK<in> is only
+ * read by the first two instructions, so it may be the same register as
+ * WK<out1>; the process_tail macro relies on this.  tmp1/tmp2 are overwritten.
+ */
+.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
+        and     tmp1, mask, WK&in                       @ 0000RRRR0000BBBB0000rrrr0000bbbb
+        and     tmp2, mask, WK&in, lsr #4               @ 0000AAAA0000GGGG0000aaaa0000gggg
+        orr     tmp1, tmp1, tmp1, lsl #4                @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
+        orr     tmp2, tmp2, tmp2, lsl #4                @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
+        pkhtb   WK&out2, tmp2, tmp1, asr #16            @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
+        pkhbt   WK&out1, tmp1, tmp2, lsl #16            @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
+        pkhtb   tmp2, WK&out2, WK&out2, asr #8          @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
+        pkhtb   tmp1, WK&out1, WK&out1, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
+        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
+        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8       @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
+        sel     WK&out1, WK&out1, tmp1                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
+        sel     WK&out2, WK&out2, tmp2                  @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/*
+ * Load numbytes/2 bytes from SRC.  numbytes appears to count destination
+ * bytes (process_tail treats 4 as one pixel), so the 16-bit source is half.
+ */
+.macro RGB444toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
+.endm
+
+/*
+ * Expand the loaded pixels in place.  For 16 bytes the second source register
+ * must be expanded first, because expanding the first one overwrites it.
+ */
+.macro RGB444toRGB888_process_tail cond, numbytes, firstreg
+ .if numbytes >= 8
+  .if numbytes == 16
+        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
+  .endif
+        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
+ .else @ numbytes == 4
+        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
+ .endif
+.endm
+
+generate_composite_function \
+    Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    2, /* prefetch distance */ \
+    RGB444toRGB888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    RGB444toRGB888_process_head, \
+    RGB444toRGB888_process_tail
-- 
2.17.1