We are currently migrating Bugzilla to GitHub issues.
Any changes made to the bug tracker now will be lost, so please do not post new bugs or make changes to them.
When we're done, all bug URLs will redirect to their equivalent location on the new bug tracker.

Bug 3479

Summary: YUV to RGB software rendering does not work
Product: SDL Reporter: Andrew Wilson <awils>
Component: video Assignee: Sam Lantinga <slouken>
Status: RESOLVED DUPLICATE QA Contact: Sam Lantinga <slouken>
Severity: major    
Priority: P2    
Version: 2.0.4   
Hardware: x86   
OS: Linux   
Attachments: patch indicating my changes that fix the problem

Description Andrew Wilson 2016-11-01 21:44:28 UTC
When trying to use the software renderer in SDL_yuv_mmx.c (ColorRGBDitherYV12MMX1X), built with gcc 4.6.3, all I see is a black screen with two or so colored pixels in the top-left-hand corner.

The problem appears to be that gcc spills registers to the stack while the inline assembly also pushes onto the stack, which makes all of the compiler-computed stack offsets incorrect.

I fixed this locally using the following scheme:



    /*
     * Planar YUV 4:2:0 -> 32-bit RGB conversion with MMX (i386, AT&T syntax).
     *
     * Each pass of the inner loop reads 4 luma bytes from the current row and
     * 4 from the row `cols` bytes further on, plus 2 Cb and 2 Cr bytes, and
     * stores 4 pixels (16 bytes) to row1 and 4 pixels to row2.
     *
     * Operand contract:
     *   - Everything the template advances (cr, cb, lum, row1, row2, x) is a
     *     read-write ("+") operand, so the compiler knows the values change.
     *     Declaring them input-only and modifying them anyway is not allowed
     *     by GCC's extended-asm rules.
     *   - The template never touches %esp, so compiler-computed stack offsets
     *     for the "m" operands stay valid for the whole statement.
     *   - Operands are named so that adding or removing one cannot silently
     *     renumber the others.
     *
     * NOTE(review): cr, cb, lum, row1 and row2 hold their advanced values
     * after this statement; this assumes they are by-value pointers that are
     * not read again afterwards -- confirm against the enclosing function.
     */
    Uint32 *row1;
    Uint32 *row2;
    unsigned char* y = lum +cols*rows;    /* Pointer to the end */
    int x = 0;

    row1 = (Uint32 *)out;                 /* 32 bit target */
    row2 = (Uint32 *)out+cols+mod;        /* start of second row */
    mod = (mod+cols+mod)*4;               /* increment for row1 in byte */

    __asm__ __volatile__ (
        ".align 8\n"
        "1:\n"

        /* create Cr (result in mm1) */
        "movd (%[cr]),%%mm1\n"
        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
        "movd (%[lum]), %%mm2\n"       /*    0  0  0  0 l3 l2 l1 l0 */
        "punpcklbw %%mm7,%%mm1\n" /*         0  v3 0  v2 00 v1 00 v0 */
        "punpckldq %%mm1,%%mm1\n" /*         00 v1 00 v0 00 v1 00 v0 */
        "psubw %[c0080w],%%mm1\n" /* mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 */

        /* create Cr_g (result in mm0) */
        "movq %%mm1,%%mm0\n"           /* r1 r1 r0 r0 r1 r1 r0 r0 */
        "pmullw %[vgrn],%%mm0\n"       /* red*-46dec=0.7136*64 */
        "pmullw %[vred],%%mm1\n"       /* red*89dec=1.4013*64 */
        "psraw  $6, %%mm0\n"           /* red=red/64 */
        "psraw  $6, %%mm1\n"           /* red=red/64 */

        /* create L1 L2 (result in mm2,mm4) */
        /* L2=lum+cols */
        "movq (%[lum],%[cols]),%%mm3\n" /*   0  0  0  0 L3 L2 L1 L0 */
        "punpckldq %%mm3,%%mm2\n"      /*   L3 L2 L1 L0 l3 l2 l1 l0 */
        "movq %%mm2,%%mm4\n"           /*   L3 L2 L1 L0 l3 l2 l1 l0 */
        "pand %[ff00w],%%mm2\n"        /*   L3 0  L1  0 l3  0 l1  0 */
        "pand %[m00ffw],%%mm4\n"       /*   0  L2  0 L0  0 l2  0 l0 */
        "psrlw $8,%%mm2\n"             /*   0  L3  0 L1  0 l3  0 l1 */

        /* create R (result in mm6) */
        "movq %%mm2,%%mm5\n"           /*   0 L3  0 L1  0 l3  0 l1 */
        "movq %%mm4,%%mm6\n"           /*   0 L2  0 L0  0 l2  0 l0 */
        "paddsw  %%mm1, %%mm5\n"       /* lum1+red:x R3 x R1 x r3 x r1 */
        "paddsw  %%mm1, %%mm6\n"       /* lum1+red:x R2 x R0 x r2 x r0 */
        "packuswb %%mm5,%%mm5\n"       /*  R3 R1 r3 r1 R3 R1 r3 r1 */
        "packuswb %%mm6,%%mm6\n"       /*  R2 R0 r2 r0 R2 R0 r2 r0 */
        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
        "punpcklbw %%mm5,%%mm6\n"      /*  R3 R2 R1 R0 r3 r2 r1 r0 */

        /* create Cb (result in mm1) */
        "movd (%[cb]), %%mm1\n"   /*         0  0  0  0  u3 u2 u1 u0 */
        "punpcklbw %%mm7,%%mm1\n" /*         0  u3 0  u2 00 u1 00 u0 */
        "punpckldq %%mm1,%%mm1\n" /*         00 u1 00 u0 00 u1 00 u0 */
        "psubw %[c0080w],%%mm1\n" /* mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 */

        /* create Cb_g (result in mm5) */
        "movq %%mm1,%%mm5\n"            /* u1 u1 u0 u0 u1 u1 u0 u0 */
        "pmullw %[ugrn],%%mm5\n"        /* blue*-109dec=1.7129*64 */
        "pmullw %[ublu],%%mm1\n"        /* blue*114dec=1.78125*64 */
        "psraw  $6, %%mm5\n"            /* Cb_g=Cb_g/64 */
        "psraw  $6, %%mm1\n"            /* blue=blue/64 */

        /* create G (result in mm7) */
        "movq %%mm2,%%mm3\n"      /*   0  L3  0 L1  0 l3  0 l1 */
        "movq %%mm4,%%mm7\n"      /*   0  L2  0 L0  0 l2  0 l0 */
        "paddsw  %%mm5, %%mm3\n"  /* lum1+Cb_g:x G3t x G1t x g3t x g1t */
        "paddsw  %%mm5, %%mm7\n"  /* lum1+Cb_g:x G2t x G0t x g2t x g0t */
        "paddsw  %%mm0, %%mm3\n"  /* lum1+Cr_g:x G3  x G1  x g3  x g1 */
        "paddsw  %%mm0, %%mm7\n"  /* lum1+Cr_g:x G2  x G0  x g2  x g0 */
        "packuswb %%mm3,%%mm3\n"  /* G3 G1 g3 g1 G3 G1 g3 g1 */
        "packuswb %%mm7,%%mm7\n"  /* G2 G0 g2 g0 G2 G0 g2 g0 */
        "punpcklbw %%mm3,%%mm7\n" /* G3 G2 G1 G0 g3 g2 g1 g0 */

        /* create B (result in mm5) */
        "movq %%mm2,%%mm3\n"         /*   0  L3  0 L1  0 l3  0 l1 */
        "movq %%mm4,%%mm5\n"         /*   0  L2  0 L0  0 l2  0 l0 */
        "paddsw  %%mm1, %%mm3\n"     /* lum1+blue:x B3 x B1 x b3 x b1 */
        "paddsw  %%mm1, %%mm5\n"     /* lum1+blue:x B2 x B0 x b2 x b0 */
        "packuswb %%mm3,%%mm3\n"     /* B3 B1 b3 b1 B3 B1 b3 b1 */
        "packuswb %%mm5,%%mm5\n"     /* B2 B0 b2 b0 B2 B0 b2 b0 */
        "punpcklbw %%mm3,%%mm5\n"    /* B3 B2 B1 B0 b3 b2 b1 b0 */

        /* fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */

        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
        "pxor %%mm4,%%mm4\n"           /*  0  0  0  0  0  0  0  0 */
        "movq %%mm6,%%mm1\n"           /* R3 R2 R1 R0 r3 r2 r1 r0 */
        "movq %%mm5,%%mm3\n"           /* B3 B2 B1 B0 b3 b2 b1 b0 */

        /* process lower lum */
        "punpcklbw %%mm4,%%mm1\n"      /*  0 r3  0 r2  0 r1  0 r0 */
        "punpcklbw %%mm4,%%mm3\n"      /*  0 b3  0 b2  0 b1  0 b0 */
        "movq %%mm1,%%mm2\n"           /*  0 r3  0 r2  0 r1  0 r0 */
        "movq %%mm3,%%mm0\n"           /*  0 b3  0 b2  0 b1  0 b0 */
        "punpcklwd %%mm1,%%mm3\n"      /*  0 r1  0 b1  0 r0  0 b0 */
        "punpckhwd %%mm2,%%mm0\n"      /*  0 r3  0 b3  0 r2  0 b2 */

        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
        "movq %%mm7,%%mm1\n"           /* G3 G2 G1 G0 g3 g2 g1 g0 */
        "punpcklbw %%mm1,%%mm2\n"      /* g3  0 g2  0 g1  0 g0  0 */
        "punpcklwd %%mm4,%%mm2\n"      /*  0  0 g1  0  0  0 g0  0 */
        "por %%mm3, %%mm2\n"           /*  0 r1 g1 b1  0 r0 g0 b0 */
        "movq %%mm2,(%[row1])\n"       /* wrote out ! row1 */

        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
        "punpcklbw %%mm1,%%mm4\n"      /* g3  0 g2  0 g1  0 g0  0 */
        "punpckhwd %%mm2,%%mm4\n"      /*  0  0 g3  0  0  0 g2  0 */
        "por %%mm0, %%mm4\n"           /*  0 r3 g3 b3  0 r2 g2 b2 */
        "movq %%mm4,8(%[row1])\n"      /* wrote out ! row1 */

        /* fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
        /* this can be done "destructive" */
        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
        "punpckhbw %%mm2,%%mm6\n"      /*  0 R3  0 R2  0 R1  0 R0 */
        "punpckhbw %%mm1,%%mm5\n"      /* G3 B3 G2 B2 G1 B1 G0 B0 */
        "movq %%mm5,%%mm1\n"           /* G3 B3 G2 B2 G1 B1 G0 B0 */
        "punpcklwd %%mm6,%%mm1\n"      /*  0 R1 G1 B1  0 R0 G0 B0 */
        "movq %%mm1,(%[row2])\n"       /* wrote out ! row2 */
        "punpckhwd %%mm6,%%mm5\n"      /*  0 R3 G3 B3  0 R2 G2 B2 */
        "movq %%mm5,8(%[row2])\n"      /* wrote out ! row2 */

        "addl $4,%[lum]\n"             /* lum+4 */
        "leal 16(%[row1]),%[row1]\n"   /* row1+16 */
        "leal 16(%[row2]),%[row2]\n"   /* row2+16 */
        "addl $2,%[cr]\n"              /* cr+2 */
        "addl $2,%[cb]\n"              /* cb+2 */

        "addl $4,%[x]\n"               /* x+4 */
        "cmpl %[cols],%[x]\n"

        "jl 1b\n"                      /* signed: x and cols are ints */
        "addl %[cols],%[lum]\n"        /* lum += cols */
        "addl %[mod],%[row1]\n"        /* row1+= mod */
        "addl %[mod],%[row2]\n"        /* row2+= mod */
        "movl $0,%[x]\n"               /* x=0 */
        "cmpl %[y],%[lum]\n"
        "jb 1b\n"                      /* unsigned: lum and y are addresses */

        "emms\n"  /* reset MMX registers. */
        : [cr] "+r" (cr), [cb] "+r" (cb), [lum] "+r" (lum),
          [row1] "+r" (row1), [row2] "+r" (row2), [x] "+m" (x)
        : [cols] "r" (cols), [y] "m" (y), [mod] "m" (mod),
          [c0080w] "m" (MMX_0080w), [vgrn] "m" (MMX_VgrnRGB),
          [vred] "m" (MMX_VredRGB), [ff00w] "m" (MMX_FF00w),
          [m00ffw] "m" (MMX_00FFw), [ugrn] "m" (MMX_UgrnRGB),
          [ublu] "m" (MMX_UbluRGB)
        : "memory", "cc"
    );
Comment 1 Sam Lantinga 2016-11-02 17:54:49 UTC
Does this bug happen in some of the other functions as well?
Can you attach a unified diff patch that fixes the issue for you?

Thanks!
Comment 2 Andrew Wilson 2016-11-04 14:57:02 UTC
Created attachment 2601 [details]
patch indicating my changes that fix the problem

I'm sorry to say that I only fixed the function that I was using, the software 4:2:0 renderer.  I would imagine that the other function Color565DitherYV12MMX1X would suffer the same problem.
Comment 3 Sam Lantinga 2016-11-05 08:50:07 UTC
Thanks. Any chance you can test the other MMX functions and fix them as well?
Comment 4 Andrew Wilson 2016-11-22 15:56:16 UTC
sure - though it may be a wee while before I can fit it in.  (Sorry for delay in replying).
Comment 5 Sam Lantinga 2017-08-11 20:27:32 UTC
This is better documented in bug 3689, so I'm closing this one.

*** This bug has been marked as a duplicate of bug 3689 ***