From 29514a55c07818d0fec529d9b029b8a93d6540da Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Tue, 30 Aug 2016 19:34:01 +0100 Subject: [PATCH 06/11] ARM: assembly optimization for SDL_FillRect --- src/video/SDL_surface.c | 22 ++++++++++ src/video/arm/pixman-arm-simd-asm.S | 68 +++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/src/video/SDL_surface.c b/src/video/SDL_surface.c index 0f3ad12c4..2a0beed17 100644 --- a/src/video/SDL_surface.c +++ b/src/video/SDL_surface.c @@ -28,6 +28,7 @@ #include "SDL_RLEaccel_c.h" #include "SDL_pixels_c.h" #include "SDL_leaks.h" +#include "SDL_cpuinfo.h" /* Public routines */ @@ -602,6 +603,27 @@ int SDL_FillRect(SDL_Surface *dst, SDL_Rect *dstrect, Uint32 color) } row = (Uint8 *)dst->pixels+dstrect->y*dst->pitch+ dstrect->x*dst->format->BytesPerPixel; +#if SDL_ARM_SIMD_BLITTERS + if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) { + void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); + void FillRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src); + void FillRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src); + switch (dst->format->BytesPerPixel) { + case 1: + FillRect8ARMSIMDAsm(dstrect->w, dstrect->h, (uint8_t *) row, dst->pitch >> 0, color); + break; + case 2: + FillRect16ARMSIMDAsm(dstrect->w, dstrect->h, (uint16_t *) row, dst->pitch >> 1, color); + break; + case 4: + FillRect32ARMSIMDAsm(dstrect->w, dstrect->h, (uint32_t *) row, dst->pitch >> 2, color); + break; + } + + SDL_UnlockSurface(dst); + return(0); + } +#endif if ( dst->format->palette || (color == 0) ) { x = dstrect->w*dst->format->BytesPerPixel; if ( !color && !((uintptr_t)row&3) && !(x&3) && !(dst->pitch&3) ) { diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S index 81e38c44b..769c213f6 100644 --- a/src/video/arm/pixman-arm-simd-asm.S +++ b/src/video/arm/pixman-arm-simd-asm.S @@ -47,6 
+47,74 @@
 
 /******************************************************************************/
 
+.macro FillRect32_init /* init hook named in the FillRect32ARMSIMDAsm instantiation below */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET] /* load a 32-bit word from sp+ARGS_STACK_OFFSET; presumably the stacked 'src' fill value - confirm */
+        mov     STRIDE_S, SRC /* copy the fill word into STRIDE_S, MASK and STRIDE_M so four registers hold the same value */
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect16_init /* init hook named in the FillRect16ARMSIMDAsm instantiation below */
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET] /* load a halfword (zero-extended) from sp+ARGS_STACK_OFFSET */
+        orr     SRC, SRC, lsl #16 /* OR in a copy shifted up 16 bits: SRC now holds the halfword twice */
+        mov     STRIDE_S, SRC /* replicate the packed word into the other three registers */
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect8_init /* init hook named in the FillRect8ARMSIMDAsm instantiation below */
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET] /* load a byte (zero-extended) from sp+ARGS_STACK_OFFSET */
+        orr     SRC, SRC, lsl #8 /* byte duplicated into the low halfword */
+        orr     SRC, SRC, lsl #16 /* halfword duplicated: SRC now holds the byte four times */
+        mov     STRIDE_S, SRC /* replicate the packed word into the other three registers */
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect_process_tail cond, numbytes, firstreg /* shared store step for all three fill widths; firstreg is not referenced in the body */
+    WK4     .req    SRC /* temporarily alias WK4..WK7 onto the four registers loaded by the init macros */
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST /* pixst is defined outside this hunk; presumably stores numbytes bytes to DST starting from register WK4 - confirm */
+    .unreq  WK4 /* drop the aliases again so the names are free after this macro */
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function /* macro defined outside this hunk; instantiated once per pixel width */ \
+    FillRect32ARMSIMDAsm, 0, 0, 32, /* presumably name, src bpp, mask bpp, dst bpp - confirm against the macro definition */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH /* NOTE(review): no comma after the flags; relies on the assembler accepting whitespace-separated macro arguments - confirm */ \
+    0,     /* prefetch distance doesn't apply */ \
+    FillRect32_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro  /* cleanup */ \
+    nop_macro  /* process head */ \
+    FillRect_process_tail /* process tail */
+
+generate_composite_function /* 16-bit variant: same arguments apart from name, width and init hook */ \
+    FillRect16ARMSIMDAsm, 0, 0, 16, /* presumably name, src bpp, mask bpp, dst bpp - confirm against the macro definition */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0,     /* prefetch distance doesn't apply */ \
+    FillRect16_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro  /* cleanup */ \
+    nop_macro  /* process head */ \
+    FillRect_process_tail /* process tail */
+
+generate_composite_function /* 8-bit variant: same arguments apart from name, width and init hook */ \
+    FillRect8ARMSIMDAsm, 0, 0, 8, /* presumably name, src bpp, mask bpp, dst bpp - confirm against the macro definition */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0,     /* prefetch distance doesn't apply */ \
+    FillRect8_init /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro  /* cleanup */ \
+    nop_macro  /* process head */ \
+    FillRect_process_tail /* process tail */
+
+/******************************************************************************/
+
 /* This differs from the
over_8888_8888 routine in Pixman in that the destination * alpha component is always left unchanged, and RGB components are not * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that -- 2.17.1