From dcd113e5b8e1f0697d99cacc36e01315098ffcaf Mon Sep 17 00:00:00 2001
From: Ben Avison
Date: Wed, 7 Nov 2018 13:13:44 +0000
Subject: [PATCH 11/11] ARM: NEON assembly optimization for SDL_FillRect

---
 src/video/SDL_surface.c             |  21 +++
 src/video/arm/pixman-arm-neon-asm.S | 128 ++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)

diff --git a/src/video/SDL_surface.c b/src/video/SDL_surface.c
index 2a0beed17..73f05034d 100644
--- a/src/video/SDL_surface.c
+++ b/src/video/SDL_surface.c
@@ -603,6 +603,27 @@ int SDL_FillRect(SDL_Surface *dst, SDL_Rect *dstrect, Uint32 color)
     }
     row = (Uint8 *)dst->pixels+dstrect->y*dst->pitch+
             dstrect->x*dst->format->BytesPerPixel;
+#if SDL_ARM_NEON_BLITTERS
+    if (SDL_HasARMNEON() && dst->format->BytesPerPixel != 3) {
+        void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+        void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+        void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+        switch (dst->format->BytesPerPixel) {
+        case 1:
+            FillRect8ARMNEONAsm(dstrect->w, dstrect->h, (uint8_t *) row, dst->pitch >> 0, color);
+            break;
+        case 2:
+            FillRect16ARMNEONAsm(dstrect->w, dstrect->h, (uint16_t *) row, dst->pitch >> 1, color);
+            break;
+        case 4:
+            FillRect32ARMNEONAsm(dstrect->w, dstrect->h, (uint32_t *) row, dst->pitch >> 2, color);
+            break;
+        }
+
+        SDL_UnlockSurface(dst);
+        return(0);
+    }
+#endif
 #if SDL_ARM_SIMD_BLITTERS
     if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
         void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 1fcf3c117..ab9bccef9 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -95,6 +95,134 @@
 
 /******************************************************************************/
 
+/* We can actually do significantly better than the Pixman macros, at least for
+ * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
+ * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
+ */
+
+.macro generate_fillrect_function name, bpp, log2Bpp
+/*
+ * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ * On entry:
+ * a1 = width, pixels
+ * a2 = height, rows
+ * a3 = pointer to top-left destination pixel
+ * a4 = stride, pixels
+ * [sp] = pixel value to fill with
+ * Within the function:
+ * v1 = width remaining
+ * v2 = vst offset
+ * v3 = alternate pointer
+ * ip = data ARM register
+ */
+pixman_asm_function name
+    vld1.\bpp   {d0[],d1[]}, [sp]
+    sub         a4, a1
+    vld1.\bpp   {d2[],d3[]}, [sp]
+    cmp         a1, #(15+64) >> \log2Bpp
+    push        {v1-v3,lr}
+    vmov        ip, s0
+    blo         51f
+
+    /* Long-row case */
+    mov         v2, #64
+1:  mov         v1, a1
+    ands        v3, a3, #15
+    beq         2f
+    /* Leading pixels */
+    rsb         v3, v3, #16 /* number of leading bytes until 16-byte aligned */
+    sub         v1, v1, v3, lsr #\log2Bpp
+    rbit        v3, v3
+.if bpp <= 16
+.if bpp == 8
+    tst         a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
+    strneb      ip, [a3], #1
+    tst         v3, #1<<30
+.else
+    tst         a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
+.endif
+    strneh      ip, [a3], #2
+.endif
+    movs        v3, v3, lsl #3
+    vstmcs      a3!, {s0}
+    vstmmi      a3!, {d0}
+2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
+    add         v3, a3, #32
+    /* Inner loop */
+3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
+    subs        v1, v1, #64 >> \log2Bpp
+    vst1.\bpp   {q0-q1}, [v3 :128], v2
+    bhs         3b
+    /* Trailing pixels */
+4:  movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         5f
+    vst1.\bpp   {q0-q1}, [a3 :128]!
+5:  bpl         6f
+    vst1.\bpp   {q0}, [a3 :128]!
+6:  movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         1b
+    pop         {v1-v3,pc}
+
+    /* Short-row case */
+51: movs        v1, a1
+.if bpp == 8
+    tst         a3, #3
+    beq         53f
+52: subs        v1, v1, #1
+    blo         57f
+    strb        ip, [a3], #1
+    tst         a3, #3
+    bne         52b
+.elseif bpp == 16
+    tstne       a3, #2
+    subne       v1, v1, #1
+    strneh      ip, [a3], #2
+.endif
+53: cmp         v1, #32 >> \log2Bpp
+    bcc         54f
+    vst1.\bpp   {q0-q1}, [a3]!
+    sub         v1, v1, #32 >> \log2Bpp
+    /* Trailing pixels */
+54: movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         55f
+    vst1.\bpp   {q0-q1}, [a3]!
+55: bpl         56f
+    vst1.\bpp   {q0}, [a3]!
+56: movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         51b
+57: pop         {v1-v3,pc}
+
+.endfunc
+.endm
+
+generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
+generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
+generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
+
+/******************************************************************************/
+
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
     vmvn        d30, d3 /* get inverted source alpha */
     vmov        d31, d7 /* dest alpha is always unchanged */
-- 
2.17.1
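
For readers less familiar with the NEON code above, the plain-C sketch below mirrors the per-row strategy that generate_fillrect_function implements for the 32bpp case: store leading pixels one at a time until the destination is 16-byte aligned, fill the bulk of the row in 64-byte chunks (the pair of vst1.32 {q0-q1} stores in the inner loop), then mop up the trailing pixels. The function name and the scalar stores here are hypothetical stand-ins used purely for illustration; they are not part of the patch, which only adds the assembly routines and the BytesPerPixel dispatch in SDL_FillRect shown above.

#include <stdint.h>

/* Illustrative reference version of the 32bpp fill; not the patch's code. */
static void fill_rect32_reference(int32_t w, int32_t h, uint32_t *dst,
                                  int32_t dst_stride, uint32_t src)
{
    while (h-- > 0) {
        uint32_t *p = dst;
        int32_t remaining = w;

        /* Leading pixels: store singly until p reaches 16-byte alignment */
        while (remaining > 0 && ((uintptr_t)p & 15) != 0) {
            *p++ = src;
            remaining--;
        }
        /* Bulk: 16 pixels (64 bytes) per iteration, like the q0-q1 store pair */
        while (remaining >= 16) {
            for (int i = 0; i < 16; i++)
                p[i] = src;
            p += 16;
            remaining -= 16;
        }
        /* Trailing pixels */
        while (remaining-- > 0)
            *p++ = src;

        dst += dst_stride;  /* stride is in pixels, as in the asm interface */
    }
}

In the patched SDL_FillRect, the corresponding NEON routine (FillRect32ARMNEONAsm and friends) is reached through the switch on dst->format->BytesPerPixel added to SDL_surface.c above, with dst->pitch converted from bytes to pixels before the call.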