--- src/hermes/mmxp2_32.asm
+++ src/hermes/mmxp2_32.asm
@@ -27,22 +27,37 @@ GLOBAL _ConvertMMXpII32_16BGR565
 GLOBAL _ConvertMMXpII32_16RGB555
 GLOBAL _ConvertMMXpII32_16BGR555
 
-SECTION .data
-	
-ALIGN 8
-
-;; Constants for conversion routines
-
-mmx32_rgb888_mask dd 00ffffffh,00ffffffh
-
-mmx32_rgb565_b dd 000000f8h, 000000f8h
-mmx32_rgb565_g dd 0000fc00h, 0000fc00h
-mmx32_rgb565_r dd 00f80000h, 00f80000h
-
-mmx32_rgb555_rb dd 00f800f8h,00f800f8h
-mmx32_rgb555_g dd 0000f800h,0000f800h
-mmx32_rgb555_mul dd 20000008h,20000008h
-mmx32_bgr555_mul dd 00082000h,00082000h
+
+;; Macros for conversion routines
+
+%macro _push_immq_mask 1		; %1 = 32-bit immediate; lowers esp by 8
+	push dword %1			; upper dword of the 8-byte stack slot
+	push dword %1			; lower dword; [esp] now holds %1 repeated twice
+%endmacro
+
+%macro load_immq 2			; %1 = destination of the movq, %2 = 32-bit immediate
+	_push_immq_mask %2		; build the qword {%2, %2} at [esp]
+	movq %1, [esp]			; %1 = that qword; the 8 bytes stay pushed until the caller releases them
+%endmacro
+
+%macro pand_immq 2			; %1 = register masked in place, %2 = 32-bit immediate
+	_push_immq_mask %2		; build the qword {%2, %2} at [esp]
+	pand %1, [esp]			; %1 &= that qword; the 8 bytes stay pushed until the caller releases them
+%endmacro
+
+; Release `num` 8-byte stack slots left by load_immq / pand_immq (ADD changes EFLAGS).
+%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
+
+%define mmx32_rgb888_mask 00ffffffh	; keeps the low 24 bits of a dword

+%define mmx32_rgb565_b 000000f8h	; top 5 bits of byte 0
+%define mmx32_rgb565_g 0000fc00h	; top 6 bits of byte 1
+%define mmx32_rgb565_r 00f80000h	; top 5 bits of byte 2

+%define mmx32_rgb555_rb 00f800f8h	; top 5 bits of bytes 0 and 2
+%define mmx32_rgb555_g 0000f800h	; top 5 bits of byte 1
+%define mmx32_rgb555_mul 20000008h	; word pair 0008h (low) / 2000h (high); loaded into mm7 as the pmaddwd multiplier
+%define mmx32_bgr555_mul 00082000h	; same two words in swapped order; the BGR555 entry loads this into mm7 instead
 
 
 			
@@ -53,7 +66,8 @@ SECTION .text
 _ConvertMMXpII32_24RGB888:
 
         ; set up mm6 as the mask, mm7 as zero
-        movq mm6, qword [mmx32_rgb888_mask]
+        load_immq mm6, mmx32_rgb888_mask   ; mm6 = 00ffffffh in both dwords, read from a temporary stack slot
+        CLEANUP_IMMQ_LOADS(1)              ; release the 8 bytes load_immq pushed
         pxor mm7, mm7
 
         mov edx, ecx                    ; save ecx
@@ -108,9 +122,10 @@ _ConvertMMXpII32_24RGB888:
 _ConvertMMXpII32_16RGB565:
 
         ; set up masks
-        movq mm5, [mmx32_rgb565_b]
-        movq mm6, [mmx32_rgb565_g]
-        movq mm7, [mmx32_rgb565_r]
+        load_immq mm5, mmx32_rgb565_b      ; mm5 = 000000f8h in both dwords
+        load_immq mm6, mmx32_rgb565_g      ; mm6 = 0000fc00h in both dwords
+        load_immq mm7, mmx32_rgb565_r      ; mm7 = 00f80000h in both dwords
+        CLEANUP_IMMQ_LOADS(3)              ; release the 3 x 8 bytes pushed by the loads above
 
         mov edx, ecx
         shr ecx, 2
@@ -176,9 +191,10 @@ _ConvertMMXpII32_16RGB565:
 	
 _ConvertMMXpII32_16BGR565:
 
-        movq mm5, [mmx32_rgb565_r]
-        movq mm6, [mmx32_rgb565_g]
-        movq mm7, [mmx32_rgb565_b]
+        load_immq mm5, mmx32_rgb565_r      ; mm5 = 00f80000h in both dwords
+        load_immq mm6, mmx32_rgb565_g      ; mm6 = 0000fc00h in both dwords
+        load_immq mm7, mmx32_rgb565_b      ; mm7 = 000000f8h in both dwords
+        CLEANUP_IMMQ_LOADS(3)              ; release the 3 x 8 bytes pushed by the loads above
 
         mov edx, ecx
         shr ecx, 2
@@ -253,7 +269,7 @@ _ConvertMMXpII32_16BGR555:
         ; except it uses a different multiplier for the pmaddwd
         ; instruction.  cool huh.
 
-        movq mm7, qword [mmx32_bgr555_mul]
+        load_immq mm7, mmx32_bgr555_mul    ; leaves 8 bytes pushed; CLEANUP_IMMQ_LOADS(2) after _convert_bgr555_cheat releases them
         jmp _convert_bgr555_cheat
 
 ; This is the same as the Intel version.. they obviously went to
@@ -263,9 +279,10 @@ _ConvertMMXpII32_16BGR555:
 ; (I think) a more accurate name..
 _ConvertMMXpII32_16RGB555:
 
-        movq mm7,qword [mmx32_rgb555_mul]
+	load_immq mm7, mmx32_rgb555_mul	; pmaddwd multiplier; its 8 stack bytes are released below
 _convert_bgr555_cheat:
-        movq mm6,qword [mmx32_rgb555_g]
+	load_immq mm6, mmx32_rgb555_g	; mm6 = 0000f800h in both dwords
+	CLEANUP_IMMQ_LOADS(2)		; release mm6's slot plus the mm7 slot pushed before this label on either entry path
         
 	mov edx,ecx		           ; Save ecx 
 
@@ -280,12 +297,14 @@ _convert_bgr555_cheat:
 	movq mm0,[esi]
 	movq mm3,mm2
 
-	pand mm3,qword [mmx32_rgb555_rb]
+	pand_immq mm3, mmx32_rgb555_rb	; mm3 &= 00f800f8h per dword; pushes 8 bytes
 	movq mm1,mm0
 
-	pand mm1,qword [mmx32_rgb555_rb]
+	pand_immq mm1, mmx32_rgb555_rb	; mm1 &= the same mask; pushes another 8 bytes
 	pmaddwd mm3,mm7
 
+	CLEANUP_IMMQ_LOADS(2)		; release the 16 bytes pushed by the two pand_immq above
+
 	pmaddwd mm1,mm7
 	pand mm2,mm6
 
@@ -302,13 +321,13 @@ _convert_bgr555_cheat:
 	movq mm0,mm4
 	psrld mm1,6
 
-	pand mm0,qword [mmx32_rgb555_rb]
+	pand_immq mm0, mmx32_rgb555_rb	; pushes 8 bytes; not released in this hunk - presumably covered by the later CLEANUP_IMMQ_LOADS(4), confirm
 	packssdw mm1,mm3
 
 	movq mm3,mm5
 	pmaddwd mm0,mm7
 
-	pand mm3,qword [mmx32_rgb555_rb]
+	pand_immq mm3, mmx32_rgb555_rb	; pushes another 8 bytes; same pending cleanup as the mm0 mask above
 	pand mm4,mm6
 
 	movq [edi],mm1			
@@ -329,12 +348,14 @@ _convert_bgr555_cheat:
 	movq mm3,mm2
 	movq mm1,mm0
 
-	pand mm3,qword [mmx32_rgb555_rb]
+	pand_immq mm3, mmx32_rgb555_rb	; mm3 &= 00f800f8h per dword; pushes 8 bytes
 	packssdw mm5,mm4
 
-	pand mm1,qword [mmx32_rgb555_rb]
+	pand_immq mm1, mmx32_rgb555_rb	; mm1 &= the same mask; pushes another 8 bytes
 	pand mm2,mm6
 
+	CLEANUP_IMMQ_LOADS(4)		; releases 32 bytes: the two pushes above plus, presumably, the earlier mm0/mm3 pand_immq pushes - TODO confirm no branch target lies between them
+
 	movq [edi+8],mm5
 	pmaddwd mm3,mm7