From dd35d0c884f18663fdc51168e83c654288d77f08 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Thu, 8 Nov 2018 18:00:15 +0000
Subject: [PATCH 10/11] ARM: NEON assembly optimization for function
 BlitARGBto565PixelAlpha

---
 src/video/SDL_blit_A.c              | 30 ++++++++--
 src/video/arm/pixman-arm-neon-asm.S | 88 +++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+), 4 deletions(-)

diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index d17091a84..2089774b2 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -422,6 +422,21 @@ BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
 #endif
 
 #if SDL_ARM_NEON_BLITTERS
+void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); /* generated by generate_composite_function in pixman-arm-neon-asm.S */
+
+static void
+BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info) /* unpacks the blit info and forwards it to the NEON assembly routine */
+{
+    int32_t width = info->dst_w;
+    int32_t height = info->dst_h;
+    uint16_t *dstp = (uint16_t *)info->dst; /* destination addressed as 16-bit pixels */
+    int32_t dststride = width + (info->dst_skip >> 1); /* row pitch in 16-bit units; assumes dst_skip is a byte count -- TODO confirm */
+    uint32_t *srcp = (uint32_t *)info->src; /* source addressed as 32-bit pixels */
+    int32_t srcstride = width + (info->src_skip >> 2); /* row pitch in 32-bit units; assumes src_skip is a byte count -- TODO confirm */
+
+    BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
+}
+
 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 
 static void
@@ -1328,14 +1343,21 @@ SDL_CalculateBlitA(SDL_Surface * surface)
             return BlitNto1PixelAlpha;
 
         case 2:
-#if SDL_ARM_SIMD_BLITTERS
+#if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
                     && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
-                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))
-                    && SDL_HasARMSIMD())
+                    || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
+                {
+#if SDL_ARM_NEON_BLITTERS
+                    if (SDL_HasNEON())
+                        return BlitARGBto565PixelAlphaARMNEON;
+#endif
+#if SDL_ARM_SIMD_BLITTERS
+                    if (SDL_HasARMSIMD())
                         return BlitARGBto565PixelAlphaARMSIMD;
-                else
+#endif
+                }
 #endif
                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
                     && sf->Gmask == 0xff00
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 076ae8eac..aa0f2259f 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -138,3 +138,91 @@ generate_composite_function \
     RGBtoRGBPixelAlpha_process_pixblock_head, \
     RGBtoRGBPixelAlpha_process_pixblock_tail, \
     RGBtoRGBPixelAlpha_process_pixblock_tail_head
+
+ /******************************************************************************/
+
+.macro ARGBto565PixelAlpha_process_pixblock_head /* in: d0-d3 = source byte planes 0..3 (d3 = alpha), q2 = 8 dest pixels; out: q13-q15 = per-field weighted sums */
+    vmvn        d6, d3  /* d6 = ~alpha = 255 - alpha */
+    vshr.u8     d1, #2  /* source byte 1 (green, Gmask 0xff00) reduced to 6 bits */
+    vshr.u8     d3, #3  /* alpha reduced to 5 bits (0..31) */
+    vshr.u8     d0, #3  /* source byte 0 reduced to 5 bits */
+    vshrn.u16   d7, q2, #3  /* d7 = dest bits 3..10 */
+    vshrn.u16   d25, q2, #8  /* d25 = dest bits 8..15 */
+    vbic.i16    q2, #0xe0  /* clear dest bits 5..7 so the low byte holds only bits 0..4 */
+    vshr.u8     d6, #3  /* inverse alpha reduced to 5 bits; d3 + d6 == 31 in every lane */
+    vshr.u8     d7, #2  /* d7 = dest bits 5..10 (green field, dest Gmask 0x7e0) */
+    vshr.u8     d2, #3  /* source byte 2 reduced to 5 bits */
+    vmovn.u16   d24, q2  /* d24 = dest bits 0..4 */
+    vshr.u8     d25, #3  /* d25 = dest bits 11..15 */
+    vmull.u8    q13, d1, d3  /* q13 = src green * alpha */
+    vmlal.u8    q13, d7, d6  /*     + dest green * inverse alpha */
+    vmull.u8    q14, d0, d3  /* q14 = src byte 0 * alpha */
+    vmlal.u8    q14, d24, d6  /*     + dest bits 0..4 * inverse alpha */
+    vmull.u8    q15, d2, d3  /* q15 = src byte 2 * alpha */
+    vmlal.u8    q15, d25, d6  /*     + dest bits 11..15 * inverse alpha */
+.endm
+
+.macro ARGBto565PixelAlpha_process_pixblock_tail /* in: q13-q15 = weighted sums from head; out: q14 = 8 packed 16-bit pixels */
+    vsra.u16    q13, #5  /* x += x >> 5 ... */
+    vsra.u16    q14, #5
+    vsra.u16    q15, #5
+    vrshr.u16   q13, #5  /* ... then x = (x + 16) >> 5; the pair approximates x / 31 with rounding */
+    vrshr.u16   q14, #5
+    vrshr.u16   q15, #5
+    vsli.u16    q14, q13, #5  /* insert green at bit 5, keeping q14 bits 0..4 */
+    vsli.u16    q14, q15, #11  /* insert the top field at bit 11, keeping q14 bits 0..10 */
+.endm
+
+.macro ARGBto565PixelAlpha_process_pixblock_tail_head /* tail of the current block interleaved with the loads and head of the next; PF lines presumably do prefetch bookkeeping (PF is defined outside this hunk) */
+    vld4.8      {d0-d3}, [SRC]!  /* load next 8 source pixels, deinterleaved into 4 byte planes */
+                                    PF add PF_X, PF_X, #8
+        vsra.u16    q13, #5  /* tail: x += x >> 5 */
+                                    PF tst PF_CTL, #0xF
+        vsra.u16    q14, #5
+                                    PF addne PF_X, PF_X, #8
+        vsra.u16    q15, #5
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshr.u16   q13, #5  /* tail: x = (x + 16) >> 5 */
+                                    PF cmp PF_X, ORIG_W
+        vrshr.u16   q14, #5
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+        vrshr.u16   q15, #5
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vld1.8      {d4-d5}, [DST_R]!  /* load next 8 dest pixels into q2 */
+                                    PF subge PF_X, PF_X, ORIG_W
+        vsli.u16    q14, q13, #5  /* tail: insert green at bit 5 */
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsli.u16    q14, q15, #11  /* tail: insert the top field at bit 11 */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+        vst1.8      {q14}, [DST_W :128]!  /* store the 8 finished pixels; q13-q15 are free for reuse below */
+    vmvn        d6, d3  /* from here on: same instruction sequence as the head macro, applied to the newly loaded block */
+    vshr.u8     d1, #2
+    vshr.u8     d3, #3
+    vshr.u8     d0, #3
+    vshrn.u16   d7, q2, #3
+    vshrn.u16   d25, q2, #8
+    vbic.i16    q2, #0xe0
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vshr.u8     d6, #3
+    vshr.u8     d7, #2
+    vshr.u8     d2, #3
+    vmovn.u16   d24, q2
+    vshr.u8     d25, #3
+    vmull.u8    q13, d1, d3
+    vmlal.u8    q13, d7, d6
+    vmull.u8    q14, d0, d3
+    vmlal.u8    q14, d24, d6
+    vmull.u8    q15, d2, d3
+    vmlal.u8    q15, d25, d6
+.endm
+
+generate_composite_function \
+    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, /* presumably source, mask and dest bits per pixel -- TODO confirm against the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* the pixblock macros read dest via DST_R, write via DST_W, and take the source as byte planes */ \
+    8, /* number of pixels, processed in a single block */ \
+    6, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    ARGBto565PixelAlpha_process_pixblock_head, \
+    ARGBto565PixelAlpha_process_pixblock_tail, \
+    ARGBto565PixelAlpha_process_pixblock_tail_head
-- 
2.17.1