--- src/hermes/mmxp2_32.asm +++ src/hermes/mmxp2_32.asm @@ -27,22 +27,37 @@ GLOBAL _ConvertMMXpII32_16BGR565 GLOBAL _ConvertMMXpII32_16RGB555 GLOBAL _ConvertMMXpII32_16BGR555 -SECTION .data - -ALIGN 8 - -;; Constants for conversion routines - -mmx32_rgb888_mask dd 00ffffffh,00ffffffh - -mmx32_rgb565_b dd 000000f8h, 000000f8h -mmx32_rgb565_g dd 0000fc00h, 0000fc00h -mmx32_rgb565_r dd 00f80000h, 00f80000h - -mmx32_rgb555_rb dd 00f800f8h,00f800f8h -mmx32_rgb555_g dd 0000f800h,0000f800h -mmx32_rgb555_mul dd 20000008h,20000008h -mmx32_bgr555_mul dd 00082000h,00082000h + +;; Macros for conversion routines + +%macro _push_immq_mask 1 + push dword %1 + push dword %1 +%endmacro + +%macro load_immq 2 + _push_immq_mask %2 + movq %1, [esp] +%endmacro + +%macro pand_immq 2 + _push_immq_mask %2 + pand %1, [esp] +%endmacro + +%define CLEANUP_IMMQ_LOADS(num) \ + add esp, byte 8 * num + +%define mmx32_rgb888_mask 00ffffffh + +%define mmx32_rgb565_b 000000f8h +%define mmx32_rgb565_g 0000fc00h +%define mmx32_rgb565_r 00f80000h + +%define mmx32_rgb555_rb 00f800f8h +%define mmx32_rgb555_g 0000f800h +%define mmx32_rgb555_mul 20000008h +%define mmx32_bgr555_mul 00082000h @@ -53,7 +66,8 @@ SECTION .text _ConvertMMXpII32_24RGB888: ; set up mm6 as the mask, mm7 as zero - movq mm6, qword [mmx32_rgb888_mask] + load_immq mm6, mmx32_rgb888_mask + CLEANUP_IMMQ_LOADS(1) pxor mm7, mm7 mov edx, ecx ; save ecx @@ -108,9 +122,10 @@ _ConvertMMXpII32_24RGB888: _ConvertMMXpII32_16RGB565: ; set up masks - movq mm5, [mmx32_rgb565_b] - movq mm6, [mmx32_rgb565_g] - movq mm7, [mmx32_rgb565_r] + load_immq mm5, mmx32_rgb565_b + load_immq mm6, mmx32_rgb565_g + load_immq mm7, mmx32_rgb565_r + CLEANUP_IMMQ_LOADS(3) mov edx, ecx shr ecx, 2 @@ -176,9 +191,10 @@ _ConvertMMXpII32_16RGB565: _ConvertMMXpII32_16BGR565: - movq mm5, [mmx32_rgb565_r] - movq mm6, [mmx32_rgb565_g] - movq mm7, [mmx32_rgb565_b] + load_immq mm5, mmx32_rgb565_r + load_immq mm6, mmx32_rgb565_g + load_immq mm7, mmx32_rgb565_b + CLEANUP_IMMQ_LOADS(3) mov edx, ecx shr ecx, 2 @@ -253,7 +269,7 @@ _ConvertMMXpII32_16BGR555: ; except it uses a different multiplier for the pmaddwd ; instruction. cool huh. - movq mm7, qword [mmx32_bgr555_mul] + load_immq mm7, mmx32_bgr555_mul jmp _convert_bgr555_cheat ; This is the same as the Intel version.. they obviously went to @@ -263,9 +279,10 @@ _ConvertMMXpII32_16BGR555: ; (I think) a more accurate name.. _ConvertMMXpII32_16RGB555: - movq mm7,qword [mmx32_rgb555_mul] + load_immq mm7, mmx32_rgb555_mul _convert_bgr555_cheat: - movq mm6,qword [mmx32_rgb555_g] + load_immq mm6, mmx32_rgb555_g + CLEANUP_IMMQ_LOADS(2) mov edx,ecx ; Save ecx @@ -280,12 +297,14 @@ _convert_bgr555_cheat: movq mm0,[esi] movq mm3,mm2 - pand mm3,qword [mmx32_rgb555_rb] + pand_immq mm3, mmx32_rgb555_rb movq mm1,mm0 - pand mm1,qword [mmx32_rgb555_rb] + pand_immq mm1, mmx32_rgb555_rb pmaddwd mm3,mm7 + CLEANUP_IMMQ_LOADS(2) + pmaddwd mm1,mm7 pand mm2,mm6 @@ -302,13 +321,13 @@ _convert_bgr555_cheat: movq mm0,mm4 psrld mm1,6 - pand mm0,qword [mmx32_rgb555_rb] + pand_immq mm0, mmx32_rgb555_rb packssdw mm1,mm3 movq mm3,mm5 pmaddwd mm0,mm7 - pand mm3,qword [mmx32_rgb555_rb] + pand_immq mm3, mmx32_rgb555_rb pand mm4,mm6 movq [edi],mm1 @@ -329,12 +348,14 @@ _convert_bgr555_cheat: movq mm3,mm2 movq mm1,mm0 - pand mm3,qword [mmx32_rgb555_rb] + pand_immq mm3, mmx32_rgb555_rb packssdw mm5,mm4 - pand mm1,qword [mmx32_rgb555_rb] + pand_immq mm1, mmx32_rgb555_rb pand mm2,mm6 + CLEANUP_IMMQ_LOADS(4) + movq [edi+8],mm5 pmaddwd mm3,mm7