Logo Search packages:      
Sourcecode: xserver-xorg-video-ati version File versions

radeon_textured_videofuncs.c

/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

/*
 * ACCEL_MMIO and ACCEL_CP are mutually exclusive: having both defined at
 * this point is a build error.
 */
#ifdef ACCEL_MMIO
#ifdef ACCEL_CP
#error Cannot define both MMIO and CP acceleration!
#endif
#endif

/*
 * FUNC_NAME_CAT(prefix,suffix) glues its two arguments into one identifier.
 * The ## operator is used unless UNIXCPP is defined without ANSICPP, in
 * which case the two tokens are separated only by an empty comment.
 */
#if defined(UNIXCPP) && !defined(ANSICPP)
#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
#else
#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
#endif

/*
 * FUNC_NAME(prefix) appends the suffix of the selected acceleration type
 * (MMIO or CP) to prefix.  ACCEL_MMIO is tested first; if neither
 * ACCEL_MMIO nor ACCEL_CP is defined the build stops with an error.
 */
#if defined(ACCEL_MMIO)
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
#elif defined(ACCEL_CP)
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
#else
#error No accel type defined!
#endif

/*
 * Vertex emission helpers.
 *
 * VTX_OUT emits four floats in the order dst x, dst y, src x, src y.
 * VTX_OUT_FILTER emits the same four floats followed by mask x and mask y.
 * Without ACCEL_CP every float is written through OUT_ACCEL_REG_F to
 * RADEON_SE_PORT_DATA0; with ACCEL_CP every float goes through OUT_RING_F.
 */
#ifndef ACCEL_CP

#define VTX_OUT(dst_x, dst_y, src_x, src_y)                         \
do {                                                                \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dst_x);                   \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, dst_y);                   \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, src_x);                   \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, src_y);                   \
} while (0)

/* Same leading four floats as VTX_OUT, then the two mask coordinates. */
#define VTX_OUT_FILTER(dst_x, dst_y, src_x, src_y, mask_x, mask_y)  \
do {                                                                \
    VTX_OUT(dst_x, dst_y, src_x, src_y);                            \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, mask_x);                  \
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, mask_y);                  \
} while (0)

#else /* ACCEL_CP */

#define VTX_OUT(dst_x, dst_y, src_x, src_y)                         \
do {                                                                \
    OUT_RING_F(dst_x);                                              \
    OUT_RING_F(dst_y);                                              \
    OUT_RING_F(src_x);                                              \
    OUT_RING_F(src_y);                                              \
} while (0)

/* Same leading four floats as VTX_OUT, then the two mask coordinates. */
#define VTX_OUT_FILTER(dst_x, dst_y, src_x, src_y, mask_x, mask_y)  \
do {                                                                \
    VTX_OUT(dst_x, dst_y, src_x, src_y);                            \
    OUT_RING_F(mask_x);                                             \
    OUT_RING_F(mask_y);                                             \
} while (0)

#endif /* ACCEL_CP */

static void
FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    PixmapPtr pPixmap = pPriv->pPixmap;
    uint32_t txformat;
    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
    uint32_t dst_offset, dst_pitch, dst_format;
    uint32_t txenable, colorpitch;
    uint32_t blendcntl;
    Bool isplanar = FALSE;
    int dstxoff, dstyoff, pixel_shift, vtx_count;
    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
    int nBox = REGION_NUM_RECTS(&pPriv->clip);
    ACCEL_PREAMBLE();

    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;

#ifdef USE_EXA
    if (info->useEXA) {
      dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
      dst_pitch = exaGetPixmapPitch(pPixmap);
    } else
#endif
      {
          dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
            info->fbLocation + pScrn->fbOffset;
          dst_pitch = pPixmap->devKind;
      }

#ifdef COMPOSITE
    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
#else
    dstxoff = 0;
    dstyoff = 0;
#endif

#ifdef USE_EXA
    if (info->useEXA) {
      RADEON_SWITCH_TO_3D();
    } else
#endif
      {
          BEGIN_ACCEL(2);
          if (IS_R300_3D || IS_R500_3D)
            OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
          else
            OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
          /* We must wait for 3d to idle, in case source was just written as a dest. */
          OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
                    RADEON_WAIT_HOST_IDLECLEAN |
                    RADEON_WAIT_2D_IDLECLEAN |
                    RADEON_WAIT_3D_IDLECLEAN |
                    RADEON_WAIT_DMA_GUI_IDLE);
          FINISH_ACCEL();

          if (!info->accel_state->XInited3D)
            RADEONInit3DEngine(pScrn);
      }

    if (pPriv->bicubic_enabled)
      vtx_count = 6;
    else
      vtx_count = 4;

    if (IS_R300_3D || IS_R500_3D) {
      uint32_t output_fmt;

      switch (pPixmap->drawable.bitsPerPixel) {
      case 16:
          if (pPixmap->drawable.depth == 15)
            dst_format = R300_COLORFORMAT_ARGB1555;
          else
            dst_format = R300_COLORFORMAT_RGB565;
          break;
      case 32:
          dst_format = R300_COLORFORMAT_ARGB8888;
          break;
      default:
          return;
      }

      output_fmt = (R300_OUT_FMT_C4_8 |
                  R300_OUT_FMT_C0_SEL_BLUE |
                  R300_OUT_FMT_C1_SEL_GREEN |
                  R300_OUT_FMT_C2_SEL_RED |
                  R300_OUT_FMT_C3_SEL_ALPHA);

      colorpitch = dst_pitch >> pixel_shift;
      colorpitch |= dst_format;

      if (RADEONTilingEnabled(pScrn, pPixmap))
          colorpitch |= R300_COLORTILE;

      if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
          isplanar = TRUE;
      }

      if (isplanar) {
          txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
          txpitch = pPriv->src_pitch;
      } else {
          if (pPriv->id == FOURCC_UYVY)
            txformat1 = R300_TX_FORMAT_YVYU422;
          else
            txformat1 = R300_TX_FORMAT_VYUY422;

          txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;

          /* pitch is in pixels */
          txpitch = pPriv->src_pitch / 2;
      }
      txpitch -= 1;

      txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
                (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
                R300_TXPITCH_EN);

      info->accel_state->texW[0] = pPriv->w;
      info->accel_state->texH[0] = pPriv->h;

      txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
                R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
                R300_TX_MAG_FILTER_LINEAR |
                R300_TX_MIN_FILTER_LINEAR |
                (0 << R300_TX_ID_SHIFT));


      if (IS_R500_3D && ((pPriv->w - 1) & 0x800))
          txpitch |= R500_TXWIDTH_11;

      if (IS_R500_3D && ((pPriv->h - 1) & 0x800))
          txpitch |= R500_TXHEIGHT_11;

      txoffset = pPriv->src_offset;

      BEGIN_ACCEL(6);
      OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
      OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
      OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
      OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
      OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
      OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
      FINISH_ACCEL();

      txenable = R300_TEX_0_ENABLE;

      if (isplanar) {
          txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
                  (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
                  R300_TXPITCH_EN);
          txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
          txpitch -= 1;
          txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
                    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
                  R300_TX_MIN_FILTER_LINEAR |
                  R300_TX_MAG_FILTER_LINEAR);

            BEGIN_ACCEL(12);
            OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
            OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
            OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
            OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
            OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
            OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
            OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
            OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
            OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
            OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
            OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
            OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
            FINISH_ACCEL();
            txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
      }

      if (pPriv->bicubic_enabled) {
            /* Size is 128x1 */
            txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
                       (0x0 << R300_TXHEIGHT_SHIFT) |
                       R300_TXPITCH_EN);
            /* Format is 32-bit floats, 4bpp */
            txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
            /* Pitch is 127 (128-1) */
            txpitch = 0x7f;
            /* Tex filter */
            txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
                      R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
                      R300_TX_MIN_FILTER_NEAREST |
                      R300_TX_MAG_FILTER_NEAREST |
                      (1 << R300_TX_ID_SHIFT));

            BEGIN_ACCEL(6);
            OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
            OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
            OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
            OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
            OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
            OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
            FINISH_ACCEL();

            /* Enable tex 1 */
            txenable |= R300_TEX_1_ENABLE;
      }

      /* setup the VAP */
      if (info->accel_state->has_tcl) {
          if (pPriv->bicubic_enabled)
            BEGIN_ACCEL(7);
          else
            BEGIN_ACCEL(6);
      } else {
          if (pPriv->bicubic_enabled)
            BEGIN_ACCEL(5);
          else
            BEGIN_ACCEL(4);
      }

      /* These registers define the number, type, and location of data submitted
       * to the PVS unit or GA input (when PVS is disabled)
       * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
       * enabled.  This memory provides the inputs to the vertex shader program
       * and ordering is not important.  When PVS/TCL is disabled, this field maps
       * directly to the GA input memory and the order is significant.  In
       * PVS_BYPASS mode the order is as follows:
       * Position
       * Point Size
       * Color 0-3
       * Textures 0-7
       * Fog
       */
      if (pPriv->bicubic_enabled) {
          OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
                    ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
                     (0 << R300_SKIP_DWORDS_0_SHIFT) |
                     (0 << R300_DST_VEC_LOC_0_SHIFT) |
                     R300_SIGNED_0 |
                     (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
                     (0 << R300_SKIP_DWORDS_1_SHIFT) |
                     (6 << R300_DST_VEC_LOC_1_SHIFT) |
                     R300_SIGNED_1));
          OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
                    ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
                     (0 << R300_SKIP_DWORDS_2_SHIFT) |
                     (7 << R300_DST_VEC_LOC_2_SHIFT) |
                     R300_LAST_VEC_2 |
                     R300_SIGNED_2));
      } else {
          OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
                    ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
                     (0 << R300_SKIP_DWORDS_0_SHIFT) |
                     (0 << R300_DST_VEC_LOC_0_SHIFT) |
                     R300_SIGNED_0 |
                     (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
                     (0 << R300_SKIP_DWORDS_1_SHIFT) |
                     (6 << R300_DST_VEC_LOC_1_SHIFT) |
                     R300_LAST_VEC_1 |
                     R300_SIGNED_1));
      }

      /* load the vertex shader
       * We pre-load vertex programs in RADEONInit3DEngine():
       * - exa mask/Xv bicubic
       * - exa no mask
       * - Xv
       * Here we select the offset of the vertex program we want to use
       */
      if (info->accel_state->has_tcl) {
          if (pPriv->bicubic_enabled) {
            OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
                        ((0 << R300_PVS_FIRST_INST_SHIFT) |
                         (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
                         (2 << R300_PVS_LAST_INST_SHIFT)));
            OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
                        (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
          } else {
            OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
                        ((5 << R300_PVS_FIRST_INST_SHIFT) |
                         (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
                         (6 << R300_PVS_LAST_INST_SHIFT)));
            OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
                        (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
          }
      }

      /* Position and one set of 2 texture coordinates */
      OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
      if (pPriv->bicubic_enabled)
          OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
                                       (2 << R300_TEX_1_COMP_CNT_SHIFT)));
      else
          OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));

      OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
      FINISH_ACCEL();

      /* setup pixel shader */
      if (IS_R300_3D) {
          if (pPriv->bicubic_enabled) {
            BEGIN_ACCEL(79);

            /* 4 components: 2 for tex0 and 2 for tex1 */
            OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
                                       R300_RS_COUNT_HIRES_EN));

            /* R300_INST_COUNT_RS - highest RS instruction used */
            OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));

            /* Pixel stack frame size. */
            OUT_ACCEL_REG(R300_US_PIXSIZE, 5);

            /* Indirection levels */
            OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
                                          R300_FIRST_TEX));

            /* Set nodes. */
            OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
                                          R300_ALU_CODE_SIZE(14) |
                                          R300_TEX_CODE_OFFSET(0) |
                                          R300_TEX_CODE_SIZE(6)));

            /* Nodes are allocated highest first, but executed lowest first */
            OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
            OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
                                          R300_ALU_SIZE(0) |
                                          R300_TEX_START(0) |
                                          R300_TEX_SIZE(0)));
            OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
                                          R300_ALU_SIZE(9) |
                                          R300_TEX_START(1) |
                                          R300_TEX_SIZE(0)));
            OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
                                          R300_ALU_SIZE(2) |
                                          R300_TEX_START(2) |
                                          R300_TEX_SIZE(3) |
                                          R300_RGBA_OUT));

            /* ** BICUBIC FP ** */

            /* texcoord0 => temp0
             * texcoord1 => temp1 */

            // first node
            /* TEX temp2, temp1.rrr0, tex1, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(1) |
                                       R300_TEX_SRC_ADDR(1) |
                                       R300_TEX_DST_ADDR(2)));

            /* MOV temp1.r, temp1.ggg0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
                                       R300_ALU_RGB_ADDRD(1) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


            // second node
            /* TEX temp1, temp1, tex1, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(1) |
                                       R300_TEX_SRC_ADDR(1) |
                                       R300_TEX_DST_ADDR(1)));

            /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
                                       R300_ALU_RGB_ADDRD(3) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


            /* MUL temp2.rg, temp2.rrr0, const0.rgb */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
                                       R300_ALU_RGB_ADDRD(2) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
                                       R300_ALU_RGB_ADDR2(3) |
                                       R300_ALU_RGB_ADDRD(4) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
                                       R300_ALU_RGB_ADDR2(2) |
                                       R300_ALU_RGB_ADDRD(5) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
                                       R300_ALU_RGB_ADDR2(3) |
                                       R300_ALU_RGB_ADDRD(3) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
                                       R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
                                       R300_ALU_RGB_ADDR2(2) |
                                       R300_ALU_RGB_ADDRD(1) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR2(1) |
                                       R300_ALU_RGB_ADDRD(1) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR2(3) |
                                       R300_ALU_RGB_ADDRD(2) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR2(5) |
                                       R300_ALU_RGB_ADDRD(3) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

            /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR2(4) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


            // third node
            /* TEX temp4, temp1.rg--, tex0, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(0) |
                                       R300_TEX_SRC_ADDR(1) |
                                       R300_TEX_DST_ADDR(4)));

            /* TEX temp3, temp3.rg--, tex0, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(0) |
                                       R300_TEX_SRC_ADDR(3) |
                                       R300_TEX_DST_ADDR(3)));

            /* TEX temp5, temp2.rg--, tex0, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(0) |
                                       R300_TEX_SRC_ADDR(2) |
                                       R300_TEX_DST_ADDR(5)));

            /* TEX temp0, temp0.rg--, tex0, 1D */
            OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
                                       R300_TEX_ID(0) |
                                       R300_TEX_SRC_ADDR(0) |
                                       R300_TEX_DST_ADDR(0)));

            /* LRP temp3, temp1.bbbb, temp4, temp3 ->
             * - PRESUB temps, temp4 - temp3
             * - MAD temp3, temp1.bbbb, temps, temp3 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
                                       R300_ALU_RGB_ADDR1(4) |
                                       R300_ALU_RGB_ADDR2(1) |
                                       R300_ALU_RGB_ADDRD(3) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
                                       R300_ALU_ALPHA_ADDR1(4) |
                                       R300_ALU_ALPHA_ADDR2(1) |
                                       R300_ALU_ALPHA_ADDRD(3) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));

            /* LRP temp0, temp1.bbbb, temp5, temp0 ->
             * - PRESUB temps, temp5 - temp0
             * - MAD temp0, temp1.bbbb, temps, temp0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
                                       R300_ALU_RGB_INSERT_NOP));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR1(5) |
                                       R300_ALU_RGB_ADDR2(1) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
                                       R300_ALU_ALPHA_ADDR1(5) |
                                       R300_ALU_ALPHA_ADDR2(1) |
                                       R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));

            /* LRP output, temp2.bbbb, temp3, temp0 ->
             * - PRESUB temps, temp3 - temp0
             * - MAD output, temp2.bbbb, temps, temp0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR1(3) |
                                       R300_ALU_RGB_ADDR2(2) |
                                       R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
                                       R300_ALU_ALPHA_ADDR1(3) |
                                       R300_ALU_ALPHA_ADDR2(2) |
                                       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));

            /* Shader constants. */
            OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
            OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
            OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
            OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);

            OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
            OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
            OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
            OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);

            FINISH_ACCEL();
          } else if (isplanar) {
          /*
           * y' = y - .0625
           * u' = u - .5
           * v' = v - .5;
           *
           * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
           * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
           * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
           *
           * DP3 might look like the straightforward solution
           * but we'd need to move the texture yuv values in
           * the same reg for this to work. Therefore use MADs.
           * Without changing the shader at all (only the constants)
           * could also provide hue/saturation/brightness/contrast control.
           *
           * yco = 1.1643
           * uco = 0, -0.39173, 2.017
           * vco = 1.5958, -0.8129, 0
           * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
           *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
           *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
           *
           * temp = MAD(yco, yuv.yyyy, off)
           * temp = MAD(uco, yuv.uuuu, temp)
           * result = MAD(vco, yuv.vvvv, temp)
           */
            /*
             * Planar YUV -> RGB conversion constants (see derivation above).
             * uco[2] is 2.017 to agree with the documented matrix
             * (255/224 * 1.772 ~= 2.0172).
             */
            float yco = 1.1643;
            float uco[3] = {0.0, -0.39173, 2.017};
            float vco[3] = {1.5958, -0.8129, 0.0};
            float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
                        -0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
                        -0.0625 * yco + -0.5 * uco[2] + -0.5 * vco[2]};

            /*
             * 33 register writes follow: 6 setup, 3 TEX instructions,
             * 3 ALU instructions x 4 registers each, 3 constants x 4 channels.
             */
            BEGIN_ACCEL(33);
            /* 2 components: same 2 for tex0/1/2 */
            OUT_ACCEL_REG(R300_RS_COUNT,
                    ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
                     R300_RS_COUNT_HIRES_EN));
            /* R300_INST_COUNT_RS - highest RS instruction used */
            OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

            OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */

            /* Indirection levels */
            OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
                                          R300_FIRST_TEX));

            /* Program is 3 ALU + 3 TEX instructions, occupying slots 0..2 */
            OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
                                       R300_ALU_CODE_SIZE(3) |
                                       R300_TEX_CODE_OFFSET(0) |
                                       R300_TEX_CODE_SIZE(3)));

            OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
                                       R300_ALU_SIZE(2) |
                                       R300_TEX_START(0) |
                                       R300_TEX_SIZE(2) |
                                       R300_RGBA_OUT));

            /* tex inst: sample tex0/tex1/tex2 at temp0 into temp0/temp1/temp2 */
            OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
                                      R300_TEX_DST_ADDR(0) |
                                      R300_TEX_ID(0) |
                                      R300_TEX_INST(R300_TEX_INST_LD)));
            OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
                                      R300_TEX_DST_ADDR(1) |
                                      R300_TEX_ID(1) |
                                      R300_TEX_INST(R300_TEX_INST_LD)));
            OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
                                      R300_TEX_DST_ADDR(2) |
                                      R300_TEX_ID(2) |
                                      R300_TEX_INST(R300_TEX_INST_LD)));

            /* ALU inst */
            /* MAD temp0, const0.a, temp0, const0.rgb */
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
                                       R300_ALU_RGB_ADDR1(0) |
                                       R300_ALU_RGB_ADDR2(0) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
                                       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
            /* alpha nop, but need to set up alpha source for rgb usage */
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
                                       R300_ALU_ALPHA_ADDR1(0) |
                                       R300_ALU_ALPHA_ADDR2(0) |
                                       R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));

            /* MAD temp0, const1, temp1, temp0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
                                       R300_ALU_RGB_ADDR1(1) |
                                       R300_ALU_RGB_ADDR2(0) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
                                       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
            /* alpha nop */
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));

            /* MAD result, const2, temp2, temp0 */
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
                                       R300_ALU_RGB_ADDR1(2) |
                                       R300_ALU_RGB_ADDR2(0) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
                                       R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
                                       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
                                       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
                                       R300_ALU_RGB_CLAMP));
            /*
             * write alpha 1 (0 * 0 + 1).  This is the alpha half of ALU
             * instruction 2, the last of the three instructions declared in
             * R300_US_CODE_OFFSET/R300_US_CODE_ADDR_3 above, so it must be
             * written to slot 2 to pair with the RGB half programmed there.
             */
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
                                       R300_ALU_ALPHA_TARGET_A));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));

            /* Shader constants. */
            /* constant 0: off, yco */
            OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
            /* constant 1: uco */
            OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(0.0));
            /* constant 2: vco */
            OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
            OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));

            FINISH_ACCEL();

          } else {
            /*
             * Fallback branch of this if/else chain (taken when isplanar is
             * false): program a minimal fragment program using the R300_US_*
             * registers - one texture load followed by one ALU instruction
             * that forwards the sample to the output.  The count passed to
             * BEGIN_ACCEL matches the 11 OUT_ACCEL_REG writes below.
             */
            BEGIN_ACCEL(11);
            /* 2 components: 2 for tex0 */
            OUT_ACCEL_REG(R300_RS_COUNT,
                    ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
                     R300_RS_COUNT_HIRES_EN));
            /* R300_INST_COUNT_RS - highest RS instruction used */
            OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

            OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */

            /* Indirection levels */
            OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
                                          R300_FIRST_TEX));

            /* One ALU and one TEX instruction, both starting at offset 0 */
            OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
                                       R300_ALU_CODE_SIZE(1) |
                                       R300_TEX_CODE_OFFSET(0) |
                                       R300_TEX_CODE_SIZE(1)));

            OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
                                       R300_ALU_SIZE(0) |
                                       R300_TEX_START(0) |
                                       R300_TEX_SIZE(0) |
                                       R300_RGBA_OUT));

            /* tex inst: load from tex0 using temp0 as source, into temp0 */
            OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
                                      R300_TEX_DST_ADDR(0) |
                                      R300_TEX_ID(0) |
                                      R300_TEX_INST(R300_TEX_INST_LD)));

            /* ALU inst */
            /* RGB: MAD output.rgb, temp0.rgb, 1.0, 0.0 with the CLAMP flag set */
            OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
                                       R300_ALU_RGB_ADDR1(0) |
                                       R300_ALU_RGB_ADDR2(0) |
                                       R300_ALU_RGB_ADDRD(0) |
                                       R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
                                       R300_ALU_RGB_MASK_G |
                                       R300_ALU_RGB_MASK_B)) |
                                       R300_ALU_RGB_TARGET_A));
            OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
                                       R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
                                       R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
                                       R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
                                       R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
                                       R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
                                       R300_ALU_RGB_CLAMP));
            /* Alpha: MAD output.a, temp0.a, 1.0, 0.0 with the CLAMP flag set */
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
                                       R300_ALU_ALPHA_ADDR1(0) |
                                       R300_ALU_ALPHA_ADDR2(0) |
                                       R300_ALU_ALPHA_ADDRD(0) |
                                       R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
                                       R300_ALU_ALPHA_TARGET_A |
                                       R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
            OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
                                       R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
                                       R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
                                       R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
                                       R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
                                       R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
                                       R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
                                       R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
                                       R300_ALU_ALPHA_CLAMP));
            FINISH_ACCEL();
          }
      } else {
          if (pPriv->bicubic_enabled) {
            /*
             * Bicubic path using the R500_US_* registers: this first group
             * (7 register writes) sets up rasterizer/pixel-stack state and
             * the code address range, then resets the vector index so that
             * the R500_GA_US_VECTOR_DATA writes in the next group start at
             * instruction index 0.
             */
            BEGIN_ACCEL(7);

            /* 4 components: 2 for tex0 and 2 for tex1 */
            OUT_ACCEL_REG(R300_RS_COUNT,
                        ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
                         R300_RS_COUNT_HIRES_EN));

            /* R300_INST_COUNT_RS - highest RS instruction used */
            OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));

            /* Pixel stack frame size. */
            OUT_ACCEL_REG(R300_US_PIXSIZE, 5);

            /* FP length: start address 0, end address / range size 13.
             * NOTE(review): presumably an inclusive last-instruction index
             * (i.e. 14 instructions) - confirm against the shader below. */
            OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
                                      R500_US_CODE_END_ADDR(13)));
            OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
                                       R500_US_CODE_RANGE_SIZE(13)));

            /* Prepare for FP emission. */
            OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
            FINISH_ACCEL();

            BEGIN_ACCEL(89);
            /* Pixel shader.
             * I've gone ahead and annotated each instruction, since this
             * thing is MASSIVE. :3
             * Note: In order to avoid buggies with temps and multiple
             * inputs, all temps are offset by 2. temp0 -> register2. */

            /* TEX temp2, input1.xxxx, tex1, 1D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
                                           R500_TEX_SRC_S_SWIZ_R |
                                           R500_TEX_SRC_T_SWIZ_R |
                                           R500_TEX_SRC_R_SWIZ_R |
                                           R500_TEX_SRC_Q_SWIZ_R |
                                           R500_TEX_DST_ADDR(2) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* TEX temp5, input1.yyyy, tex1, 1D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_SEM_ACQUIRE |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
                                           R500_TEX_SRC_S_SWIZ_G |
                                           R500_TEX_SRC_T_SWIZ_G |
                                           R500_TEX_SRC_R_SWIZ_G |
                                           R500_TEX_SRC_Q_SWIZ_G |
                                           R500_TEX_DST_ADDR(5) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* MUL temp4, const0.x0x0, temp2.yyxx */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_ADDR0_CONST |
                                           R500_RGB_ADDR1(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_ADDR0_CONST |
                                           R500_ALPHA_ADDR1(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
                                           R500_ALU_RGB_R_SWIZ_A_R |
                                           R500_ALU_RGB_G_SWIZ_A_0 |
                                           R500_ALU_RGB_B_SWIZ_A_R |
                                           R500_ALU_RGB_SEL_B_SRC1 |
                                           R500_ALU_RGB_R_SWIZ_B_G |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_R));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC0 |
                                           R500_ALPHA_SWIZ_A_0 |
                                           R500_ALPHA_SEL_B_SRC1 |
                                           R500_ALPHA_SWIZ_B_R));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_R_SWIZ_0 |
                                           R500_ALU_RGBA_G_SWIZ_0 |
                                           R500_ALU_RGBA_B_SWIZ_0 |
                                           R500_ALU_RGBA_A_SWIZ_0));

            /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_ADDR0_CONST |
                                           R500_RGB_ADDR1(5) |
                                           R500_RGB_ADDR2(4)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_ADDR0_CONST |
                                           R500_ALPHA_ADDR1(5) |
                                           R500_ALPHA_ADDR2(4)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
                                           R500_ALU_RGB_R_SWIZ_A_0 |
                                           R500_ALU_RGB_G_SWIZ_A_G |
                                           R500_ALU_RGB_B_SWIZ_A_0 |
                                           R500_ALU_RGB_SEL_B_SRC1 |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_R |
                                           R500_ALU_RGB_B_SWIZ_B_R));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC0 |
                                           R500_ALPHA_SWIZ_A_G |
                                           R500_ALPHA_SEL_B_SRC1 |
                                           R500_ALPHA_SWIZ_B_R));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC2 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_B |
                                           R500_ALU_RGBA_A_SWIZ_A));

            /* ADD temp3, temp3, input0.xyxy */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
                                           R500_RGB_ADDR2(0)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
                                           R500_ALPHA_ADDR2(0)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
                                           R500_ALU_RGB_G_SWIZ_A_1 |
                                           R500_ALU_RGB_B_SWIZ_A_1 |
                                           R500_ALU_RGB_SEL_B_SRC1 |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SWIZ_A_1 |
                                           R500_ALPHA_SEL_B_SRC1 |
                                           R500_ALPHA_SWIZ_B_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC2 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_R |
                                           R500_ALU_RGBA_A_SWIZ_G));

            /* TEX temp1, temp3.zwxy, tex0, 2D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
                                           R500_TEX_SRC_S_SWIZ_B |
                                           R500_TEX_SRC_T_SWIZ_A |
                                           R500_TEX_SRC_R_SWIZ_R |
                                           R500_TEX_SRC_Q_SWIZ_G |
                                           R500_TEX_DST_ADDR(1) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* TEX temp3, temp3.xyzw, tex0, 2D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_SEM_ACQUIRE |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
                                           R500_TEX_SRC_S_SWIZ_R |
                                           R500_TEX_SRC_T_SWIZ_G |
                                           R500_TEX_SRC_R_SWIZ_B |
                                           R500_TEX_SRC_Q_SWIZ_A |
                                           R500_TEX_DST_ADDR(3) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_ADDR0_CONST |
                                           R500_RGB_ADDR1(5) |
                                           R500_RGB_ADDR2(4)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_ADDR0_CONST |
                                           R500_ALPHA_ADDR1(5) |
                                           R500_ALPHA_ADDR2(4)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
                                           R500_ALU_RGB_R_SWIZ_A_0 |
                                           R500_ALU_RGB_G_SWIZ_A_G |
                                           R500_ALU_RGB_B_SWIZ_A_0 |
                                           R500_ALU_RGB_SEL_B_SRC1 |
                                           R500_ALU_RGB_R_SWIZ_B_G |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_G));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC0 |
                                           R500_ALPHA_SWIZ_A_G |
                                           R500_ALPHA_SEL_B_SRC1 |
                                           R500_ALPHA_SWIZ_B_G));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC2 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_B |
                                           R500_ALU_RGBA_A_SWIZ_A));

            /* ADD temp0, temp4, input0.xyxy */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
                                           R500_RGB_ADDR2(0)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
                                           R500_ALPHA_ADDR2(0)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
                                           R500_ALU_RGB_G_SWIZ_A_1 |
                                           R500_ALU_RGB_B_SWIZ_A_1 |
                                           R500_ALU_RGB_SEL_B_SRC1 |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SWIZ_A_1 |
                                           R500_ALPHA_SEL_B_SRC1 |
                                           R500_ALPHA_SWIZ_B_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC2 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_R |
                                           R500_ALU_RGBA_A_SWIZ_G));

            /* TEX temp4, temp0.zwzw, tex0, 2D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
                                           R500_TEX_SRC_S_SWIZ_B |
                                           R500_TEX_SRC_T_SWIZ_A |
                                           R500_TEX_SRC_R_SWIZ_B |
                                           R500_TEX_SRC_Q_SWIZ_A |
                                           R500_TEX_DST_ADDR(4) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* TEX temp0, temp0.xyzw, tex0, 2D */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                       R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_SEM_ACQUIRE |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
                                           R500_TEX_SRC_S_SWIZ_R |
                                           R500_TEX_SRC_T_SWIZ_G |
                                           R500_TEX_SRC_R_SWIZ_B |
                                           R500_TEX_SRC_Q_SWIZ_A |
                                           R500_TEX_DST_ADDR(0) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* LRP temp3, temp2.zzzz, temp1, temp3 ->
             * - PRESUB temps, temp1 - temp3
             * - MAD temp2.zzzz, temps, temp3 */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
                                           R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
                                           R500_RGB_ADDR1(1) |
                                           R500_RGB_ADDR2(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
                                           R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
                                           R500_ALPHA_ADDR1(1) |
                                           R500_ALPHA_ADDR2(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
                                           R500_ALU_RGB_R_SWIZ_A_B |
                                           R500_ALU_RGB_G_SWIZ_A_B |
                                           R500_ALU_RGB_B_SWIZ_A_B |
                                           R500_ALU_RGB_SEL_B_SRCP |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC2 |
                                           R500_ALPHA_SWIZ_A_B |
                                           R500_ALPHA_SEL_B_SRCP |
                                           R500_ALPHA_SWIZ_B_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC0 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_B |
                                           R500_ALU_RGBA_A_SWIZ_A));

            /* LRP temp0, temp2.zzzz, temp4, temp0 ->
             * - PRESUB temps, temp4 - temp0
             * - MAD temp2.zzzz, temps, temp0 */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
                                           R500_RGB_ADDR1(4) |
                                           R500_RGB_ADDR2(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
                                           R500_ALPHA_ADDR1(4) |
                                           R500_ALPHA_ADDR2(2)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
                                           R500_ALU_RGB_R_SWIZ_A_B |
                                           R500_ALU_RGB_G_SWIZ_A_B |
                                           R500_ALU_RGB_B_SWIZ_A_B |
                                           R500_ALU_RGB_SEL_B_SRCP |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC2 |
                                           R500_ALPHA_SWIZ_A_B |
                                           R500_ALPHA_SEL_B_SRCP |
                                           R500_ALPHA_SWIZ_B_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC0 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_B |
                                           R500_ALU_RGBA_A_SWIZ_A));

            /* LRP output, temp5.zzzz, temp3, temp0 ->
             * - PRESUB temps, temp3 - temp0
             * - MAD temp5.zzzz, temps, temp0 */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
                                           R500_INST_LAST |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK |
                                           R500_INST_RGB_OMASK_R |
                                           R500_INST_RGB_OMASK_G |
                                           R500_INST_RGB_OMASK_B |
                                           R500_INST_ALPHA_OMASK));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
                                           R500_RGB_ADDR1(3) |
                                           R500_RGB_ADDR2(5)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
                                           R500_ALPHA_ADDR1(3) |
                                           R500_ALPHA_ADDR2(5)));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
                                           R500_ALU_RGB_R_SWIZ_A_B |
                                           R500_ALU_RGB_G_SWIZ_A_B |
                                           R500_ALU_RGB_B_SWIZ_A_B |
                                           R500_ALU_RGB_SEL_B_SRCP |
                                           R500_ALU_RGB_R_SWIZ_B_R |
                                           R500_ALU_RGB_G_SWIZ_B_G |
                                           R500_ALU_RGB_B_SWIZ_B_B));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
                                           R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SEL_A_SRC2 |
                                           R500_ALPHA_SWIZ_A_B |
                                           R500_ALPHA_SEL_B_SRCP |
                                           R500_ALPHA_SWIZ_B_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
                                           R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_SEL_C_SRC0 |
                                           R500_ALU_RGBA_R_SWIZ_R |
                                           R500_ALU_RGBA_G_SWIZ_G |
                                           R500_ALU_RGBA_B_SWIZ_B |
                                           R500_ALU_RGBA_A_SWIZ_A));

            /* Shader constants. */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));

            /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
            OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
            OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
            OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
            OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);

            FINISH_ACCEL();

          } else {
            BEGIN_ACCEL(19);
            /* 2 components: 2 for tex0 */
            OUT_ACCEL_REG(R300_RS_COUNT,
                        ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
                         R300_RS_COUNT_HIRES_EN));

            /* R300_INST_COUNT_RS - highest RS instruction used */
            OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

            /* Pixel stack frame size. */
            OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */

            /* FP length. */
            OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
                                      R500_US_CODE_END_ADDR(1)));
            OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
                                       R500_US_CODE_RANGE_SIZE(1)));

            /* Prepare for FP emission. */
            OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));

            /* tex inst */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_RGB_WMASK_R |
                                           R500_INST_RGB_WMASK_G |
                                           R500_INST_RGB_WMASK_B |
                                           R500_INST_ALPHA_WMASK |
                                           R500_INST_RGB_CLAMP |
                                           R500_INST_ALPHA_CLAMP));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
                                           R500_TEX_INST_LD |
                                           R500_TEX_SEM_ACQUIRE |
                                           R500_TEX_IGNORE_UNCOVERED));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
                                           R500_TEX_SRC_S_SWIZ_R |
                                           R500_TEX_SRC_T_SWIZ_G |
                                           R500_TEX_DST_ADDR(0) |
                                           R500_TEX_DST_R_SWIZ_R |
                                           R500_TEX_DST_G_SWIZ_G |
                                           R500_TEX_DST_B_SWIZ_B |
                                           R500_TEX_DST_A_SWIZ_A));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
                                           R500_DX_S_SWIZ_R |
                                           R500_DX_T_SWIZ_R |
                                           R500_DX_R_SWIZ_R |
                                           R500_DX_Q_SWIZ_R |
                                           R500_DY_ADDR(0) |
                                           R500_DY_S_SWIZ_R |
                                           R500_DY_T_SWIZ_R |
                                           R500_DY_R_SWIZ_R |
                                           R500_DY_Q_SWIZ_R));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

            /* ALU inst */
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
                                           R500_INST_TEX_SEM_WAIT |
                                           R500_INST_LAST |
                                           R500_INST_RGB_OMASK_R |
                                           R500_INST_RGB_OMASK_G |
                                           R500_INST_RGB_OMASK_B |
                                           R500_INST_ALPHA_OMASK |
                                           R500_INST_RGB_CLAMP |
                                           R500_INST_ALPHA_CLAMP));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
                                           R500_RGB_ADDR1(0) |
                                           R500_RGB_ADDR1_CONST |
                                           R500_RGB_ADDR2(0) |
                                           R500_RGB_ADDR2_CONST));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
                                           R500_ALPHA_ADDR1(0) |
                                           R500_ALPHA_ADDR1_CONST |
                                           R500_ALPHA_ADDR2(0) |
                                           R500_ALPHA_ADDR2_CONST));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
                                           R500_ALU_RGB_R_SWIZ_A_R |
                                           R500_ALU_RGB_G_SWIZ_A_G |
                                           R500_ALU_RGB_B_SWIZ_A_B |
                                           R500_ALU_RGB_SEL_B_SRC0 |
                                           R500_ALU_RGB_R_SWIZ_B_1 |
                                           R500_ALU_RGB_B_SWIZ_B_1 |
                                           R500_ALU_RGB_G_SWIZ_B_1));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
                                           R500_ALPHA_SWIZ_A_A |
                                           R500_ALPHA_SWIZ_B_1));
            OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
                                           R500_ALU_RGBA_R_SWIZ_0 |
                                           R500_ALU_RGBA_G_SWIZ_0 |
                                           R500_ALU_RGBA_B_SWIZ_0 |
                                           R500_ALU_RGBA_A_SWIZ_0));
            FINISH_ACCEL();
          }
      }

      BEGIN_ACCEL(6);
      OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
      OUT_ACCEL_REG(R300_TX_ENABLE, txenable);

      OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
      OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);

      blendcntl = RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO;
      /* no need to enable blending */
      OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, blendcntl);

      OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
      FINISH_ACCEL();

    } else {

      /* Same for R100/R200 */
      switch (pPixmap->drawable.bitsPerPixel) {
      case 16:
          if (pPixmap->drawable.depth == 15)
            dst_format = RADEON_COLOR_FORMAT_ARGB1555;
          else
            dst_format = RADEON_COLOR_FORMAT_RGB565;
          break;
      case 32:
          dst_format = RADEON_COLOR_FORMAT_ARGB8888;
          break;
      default:
          return;
      }

      if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
          isplanar = TRUE;
      }

      if (isplanar) {
          txformat = RADEON_TXFORMAT_I8;
      } else {
          if (pPriv->id == FOURCC_UYVY)
            txformat = RADEON_TXFORMAT_YVYU422;
          else
            txformat = RADEON_TXFORMAT_VYUY422;
      }

      txformat |= RADEON_TXFORMAT_NON_POWER2;

      colorpitch = dst_pitch >> pixel_shift;

      if (RADEONTilingEnabled(pScrn, pPixmap))
          colorpitch |= RADEON_COLOR_TILE_ENABLE;

      BEGIN_ACCEL(4);

      OUT_ACCEL_REG(RADEON_RB3D_CNTL,
                  dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
      OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);

      OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);

      OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
                  RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);

      FINISH_ACCEL();


      if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
          (info->ChipFamily == CHIP_FAMILY_RV280) ||
          (info->ChipFamily == CHIP_FAMILY_RS300) ||
          (info->ChipFamily == CHIP_FAMILY_R200)) {

          info->accel_state->texW[0] = pPriv->w;
          info->accel_state->texH[0] = pPriv->h;

          if (isplanar) {
            /* note: in contrast to r300, use input biasing on uv components */
            float yco = 1.1643;
            float yoff = -0.0625 * yco;
            float uco[3] = {0.0, -0.39173, 2.018};
            float vco[3] = {1.5958, -0.8129, 0.0};

            /* need 2 texcoord sets (even though they are identical) due
               to denormalization! hw apparently can't premultiply
               same coord set by different texture size */
            vtx_count = 6;

            txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
                      (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
            txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
            txpitch -= 32;
            txfilter =  R200_MAG_FILTER_LINEAR |
                      R200_MIN_FILTER_LINEAR |
                      R200_CLAMP_S_CLAMP_LAST |
                      R200_CLAMP_T_CLAMP_LAST;

            BEGIN_ACCEL(36);

            OUT_ACCEL_REG(RADEON_PP_CNTL,
                        RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
                        RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
                        RADEON_TEX_BLEND_2_ENABLE);

            OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
            OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
                        (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
                        (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));

            OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
            OUT_ACCEL_REG(R200_PP_TXSIZE_0,
                        (pPriv->w - 1) |
                        ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
            OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
            OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

            OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
            OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
            OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
            OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);

            OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
            OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
            OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
            OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);

            /* similar to r300 code. Note the big problem is that hardware constants
             * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
             * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
             * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
             * the constants do not. To get a larger range we can use output scale, but for
             * that 2.018 value we need a total scale by 8, which means the constants
             * really have no accuracy whatsoever (5 fractional bits only).
             * The only direct way to get high precision "constants" into the fragment
             * pipe I know of is to use the texcoord interpolator (not color, this one
             * is 8 bit only too), which seems a bit expensive. We're lucky though it
             * seems the values we need seem to fit better than worst case (get about
             * 6 fractional bits for this instead of 5, at least when not correcting for
             * hue/saturation/contrast/brightness, which is the same as for vco - yco and
             * yoff get 8 fractional bits).
             *
             * A higher precision (8 fractional bits) version might just put uco into
             * a texcoord, and calculate a new vcoconst in the shader, like so:
             * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
             * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
             * vcocalc = ADD temp, bias/scale(cohelper), vco
             * would in total use 4 tex units, 4 instructions which seems fairly
             * balanced for this architecture (instead of 3 + 3 for the solution here)
             *
             * temp = MAD(yco, yuv.yyyy, yoff)
             * temp = MAD(uco, yuv.uuuu, temp)
             * result = MAD(vco, yuv.vvvv, temp)
             *
             * note first mad produces actually scalar, hence we transform
             * it into a dp2a to get 8 bit precision of yco instead of 7 -
             * That's assuming hw correctly expands consts to internal precision.
             * (y * 1 + y * (yco - 1) + yoff)
             * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
             * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
             * result = MAD x2 (vco / 2, yuv.vvvv, temp)
             *
             * vco, uco need bias (and hence scale too)
             *
             */

            /* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_ARG_B_R0_COLOR |
                        R200_TXC_ARG_C_TFACTOR_COLOR |
                        R200_TXC_NEG_ARG_C |
                        R200_TXC_OP_DOT2_ADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
                        (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        R200_TXC_SCALE_INV2 |
                        R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
            OUT_ACCEL_REG(R200_PP_TXABLEND_0,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
                        R200_TXA_OUTPUT_REG_NONE);

            /* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_BIAS_ARG_A |
                        R200_TXC_SCALE_ARG_A |
                        R200_TXC_ARG_B_R1_COLOR |
                        R200_TXC_BIAS_ARG_B |
                        R200_TXC_SCALE_ARG_B |
                        R200_TXC_ARG_C_R0_COLOR |
                        R200_TXC_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
                        (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
            OUT_ACCEL_REG(R200_PP_TXABLEND_1,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
                        R200_TXA_OUTPUT_REG_NONE);

            /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_BIAS_ARG_A |
                        R200_TXC_SCALE_ARG_A |
                        R200_TXC_ARG_B_R2_COLOR |
                        R200_TXC_BIAS_ARG_B |
                        R200_TXC_ARG_C_R0_COLOR |
                        R200_TXC_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
                        (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        R200_TXC_SCALE_2X |
                        R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
            OUT_ACCEL_REG(R200_PP_TXABLEND_2,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_COMP_ARG_C |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
                        R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);

            /* shader constants */
            OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
                                                yco - 1.0,
                                                -yoff, /* range [-1, 0] */
                                                0.0));
            OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
                                                uco[1] * 0.125 + 0.5,
                                                uco[2] * 0.125 + 0.5,
                                                0.0));
            OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
                                                vco[1] * 0.25 + 0.5,
                                                vco[2] * 0.25 + 0.5,
                                                0.0));

            FINISH_ACCEL();
          }
          else if (info->ChipFamily == CHIP_FAMILY_RV250) {
            /* fix up broken packed yuv - shader same as above except
               yuv components are all in the same reg */
            float yco = 1.1643;
            float yoff = -0.0625 * yco;
            float uco[3] = {0.0, -0.39173, 2.018};
            float vco[3] = {1.5958, -0.8129, 0.0};

            txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
                      (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
            txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
            txpitch -= 32;
            txfilter =  R200_MAG_FILTER_LINEAR |
                      R200_MIN_FILTER_LINEAR |
                      R200_CLAMP_S_CLAMP_LAST |
                      R200_CLAMP_T_CLAMP_LAST;

            BEGIN_ACCEL(24);

            OUT_ACCEL_REG(RADEON_PP_CNTL,
                        RADEON_TEX_0_ENABLE |
                        RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
                        RADEON_TEX_BLEND_2_ENABLE);

            OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
            OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
                        (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));

            OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
            OUT_ACCEL_REG(R200_PP_TXSIZE_0,
                        (pPriv->w - 1) |
                        ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
            OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
            OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

            /* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_ARG_B_R0_COLOR |
                        R200_TXC_ARG_C_TFACTOR_COLOR |
                        R200_TXC_NEG_ARG_C |
                        R200_TXC_OP_DOT2_ADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
                        (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        R200_TXC_SCALE_INV2 |
                        (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
                        R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
            OUT_ACCEL_REG(R200_PP_TXABLEND_0,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
                        R200_TXA_OUTPUT_REG_NONE);

            /* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_BIAS_ARG_A |
                        R200_TXC_SCALE_ARG_A |
                        R200_TXC_ARG_B_R0_COLOR |
                        R200_TXC_BIAS_ARG_B |
                        R200_TXC_SCALE_ARG_B |
                        R200_TXC_ARG_C_R1_COLOR |
                        R200_TXC_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
                        (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
                        R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
            OUT_ACCEL_REG(R200_PP_TXABLEND_1,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
                        R200_TXA_OUTPUT_REG_NONE);

            /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
            OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
                        R200_TXC_ARG_A_TFACTOR_COLOR |
                        R200_TXC_BIAS_ARG_A |
                        R200_TXC_SCALE_ARG_A |
                        R200_TXC_ARG_B_R0_COLOR |
                        R200_TXC_BIAS_ARG_B |
                        R200_TXC_ARG_C_R1_COLOR |
                        R200_TXC_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
                        (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
                        R200_TXC_SCALE_2X |
                        (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
                        R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
            OUT_ACCEL_REG(R200_PP_TXABLEND_2,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_ZERO |
                        R200_TXA_COMP_ARG_C |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
                        R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);

            /* shader constants */
            OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
                                                yco - 1.0,
                                                -yoff, /* range [-1, 0] */
                                                0.0));
            OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
                                                uco[1] * 0.125 + 0.5,
                                                uco[2] * 0.125 + 0.5,
                                                0.0));
            OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
                                                vco[1] * 0.25 + 0.5,
                                                vco[2] * 0.25 + 0.5,
                                                0.0));

            FINISH_ACCEL();
          }
          else {
            BEGIN_ACCEL(13);
            OUT_ACCEL_REG(RADEON_PP_CNTL,
                        RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);

            OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
            OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
                        (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));

            OUT_ACCEL_REG(R200_PP_TXFILTER_0,
                        R200_MAG_FILTER_LINEAR |
                        R200_MIN_FILTER_LINEAR |
                        R200_CLAMP_S_CLAMP_LAST |
                        R200_CLAMP_T_CLAMP_LAST |
                        R200_YUV_TO_RGB);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
            OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
            OUT_ACCEL_REG(R200_PP_TXSIZE_0,
                        (pPriv->w - 1) |
                        ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
            OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);

            OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

            OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
                        R200_TXC_ARG_A_ZERO |
                        R200_TXC_ARG_B_ZERO |
                        R200_TXC_ARG_C_R0_COLOR |
                        R200_TXC_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
                        R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
            OUT_ACCEL_REG(R200_PP_TXABLEND_0,
                        R200_TXA_ARG_A_ZERO |
                        R200_TXA_ARG_B_ZERO |
                        R200_TXA_ARG_C_R0_ALPHA |
                        R200_TXA_OP_MADD);
            OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
                        R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
            FINISH_ACCEL();
          }
      } else {

          info->accel_state->texW[0] = 1;
          info->accel_state->texH[0] = 1;

          BEGIN_ACCEL(9);

          OUT_ACCEL_REG(RADEON_PP_CNTL,
                    RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);

          OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
                                    RADEON_SE_VTX_FMT_ST0));

          OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
                    RADEON_MAG_FILTER_LINEAR |
                    RADEON_MIN_FILTER_LINEAR |
                    RADEON_CLAMP_S_CLAMP_LAST |
                    RADEON_CLAMP_T_CLAMP_LAST |
                    RADEON_YUV_TO_RGB);
          OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
          OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
          OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
                    RADEON_COLOR_ARG_A_ZERO |
                    RADEON_COLOR_ARG_B_ZERO |
                    RADEON_COLOR_ARG_C_T0_COLOR |
                    RADEON_BLEND_CTL_ADD |
                    RADEON_CLAMP_TX);
          OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
                    RADEON_ALPHA_ARG_A_ZERO |
                    RADEON_ALPHA_ARG_B_ZERO |
                    RADEON_ALPHA_ARG_C_T0_ALPHA |
                    RADEON_BLEND_CTL_ADD |
                    RADEON_CLAMP_TX);

          OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
                    (pPriv->w - 1) |
                    ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
          OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
                    pPriv->src_pitch - 32);
          FINISH_ACCEL();
      }
    }

    if (pPriv->vsync) {
      xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
                                        pPriv->drw_x,
                                        pPriv->drw_x + pPriv->dst_w,
                                        pPriv->drw_y,
                                        pPriv->drw_y + pPriv->dst_h);
      if (crtc) {
          RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;

          FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
                                radeon_crtc->crtc_id,
                                pPriv->drw_y - crtc->y,
                                (pPriv->drw_y - crtc->y) + pPriv->dst_h);
      }
    }
    /*
     * Rendering of the actual polygon is done in two different
     * ways depending on chip generation:
     *
     * < R300:
     *
     *     These chips can render a rectangle in one pass, so
     *     handling is pretty straight-forward.
     *
     * >= R300:
     *
     *     These chips can accept a quad, but will render it as
     *     two triangles which results in a diagonal tear. Instead
     *     we render a single, large triangle and use the scissor
     *     functionality to restrict it to the desired rectangle.
     *     Due to guardband limits on r3xx/r4xx, we can only use
     *     the single triangle up to 2880 pixels; above that we
     *     render as a quad.
     */

    while (nBox--) {
      int srcX, srcY, srcw, srch;
      int dstX, dstY, dstw, dsth;
      Bool use_quad = FALSE;
      dstX = pBox->x1 + dstxoff;
      dstY = pBox->y1 + dstyoff;
      dstw = pBox->x2 - pBox->x1;
      dsth = pBox->y2 - pBox->y1;

      srcX = ((pBox->x1 - pPriv->drw_x) *
            pPriv->src_w) / pPriv->dst_w;
      srcY = ((pBox->y1 - pPriv->drw_y) *
            pPriv->src_h) / pPriv->dst_h;

      srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
      srch = (pPriv->src_h * dsth) / pPriv->dst_h;

#if 0
      ErrorF("dst: %d, %d, %d, %d\n", dstX, dstY, dstw, dsth);
      ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch);
#endif

      if (IS_R300_3D || IS_R500_3D) {
          if (IS_R300_3D && ((dstw+dsth) > 2880))
            use_quad = TRUE;
          /*
           * Set up the scissor area to that of the output size.
           */
          BEGIN_ACCEL(2);
          if (IS_R300_3D) {
            /* R300 has an offset */
            OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) |
                                     ((dstY + 1088) << R300_SCISSOR_Y_SHIFT)));
            OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) |
                                     ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT)));
          } else {
            OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
                                     ((dstY) << R300_SCISSOR_Y_SHIFT)));
            OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
                                     ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
          }
          FINISH_ACCEL();
      }

#ifdef ACCEL_CP
      if (info->ChipFamily < CHIP_FAMILY_R200) {
          BEGIN_RING(3 * vtx_count + 3);
          OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
                        3 * vtx_count + 1));
          OUT_RING(RADEON_CP_VC_FRMT_XY |
                 RADEON_CP_VC_FRMT_ST0);
          OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
                 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
                 RADEON_CP_VC_CNTL_MAOS_ENABLE |
                 RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
                 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
      } else if (IS_R300_3D || IS_R500_3D) {
          if (use_quad) {
            BEGIN_RING(4 * vtx_count + 4);
            OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
                            4 * vtx_count));
            OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
                   RADEON_CP_VC_CNTL_PRIM_WALK_RING |
                   (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
          } else {
            BEGIN_RING(3 * vtx_count + 4);
            OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
                            3 * vtx_count));
            OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
                   RADEON_CP_VC_CNTL_PRIM_WALK_RING |
                   (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
          }
      } else {
          BEGIN_RING(3 * vtx_count + 2);
          OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
                        3 * vtx_count));
          OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
                 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
                 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
      }
#else /* ACCEL_CP */
      if (IS_R300_3D || IS_R500_3D) {
          if (use_quad)
            BEGIN_ACCEL(2 + vtx_count * 4);
          else
            BEGIN_ACCEL(2 + vtx_count * 3);
      } else
          BEGIN_ACCEL(1 + vtx_count * 3);

      if (info->ChipFamily < CHIP_FAMILY_R200)
          OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
                                    RADEON_VF_PRIM_WALK_DATA |
                                    RADEON_VF_RADEON_MODE |
                                    (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
      else if (IS_R300_3D || IS_R500_3D) {
          if (use_quad)
            OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
                                      RADEON_VF_PRIM_WALK_DATA |
                                      (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
          else
            OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
                                      RADEON_VF_PRIM_WALK_DATA |
                                      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
      } else
          OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
                                    RADEON_VF_PRIM_WALK_DATA |
                                    (3 << RADEON_VF_NUM_VERTICES_SHIFT)));

#endif
      if (pPriv->bicubic_enabled) {
            /*
             * This code is only executed on >= R300, so we don't
             * have to deal with the legacy handling.
             */
          if (use_quad) {
            VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
                         (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
                         (float)srcX + 0.5,                                 (float)srcY + 0.5);
            VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dsth),
                         (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
                         (float)srcX + 0.5,                                 (float)(srcY + srch) + 0.5);
            VTX_OUT_FILTER((float)(dstX + dstw),                              (float)(dstY + dsth),
                         (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
                         (float)(srcX + srcw) + 0.5,                        (float)(srcY + srch) + 0.5);
            VTX_OUT_FILTER((float)(dstX + dstw),                              (float)dstY,
                         (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
                         (float)(srcX + srcw) + 0.5,                        (float)srcY + 0.5);
          } else {
            VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
                         (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
                         (float)srcX + 0.5,                                 (float)srcY + 0.5);
            VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
                         (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
                         (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
            VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
                         ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
                                                                            (float)srcY / info->accel_state->texH[0],
                         (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
                                                                            (float)srcY + 0.5);
          }
      } else {
          if (IS_R300_3D || IS_R500_3D) {
            if (use_quad) {
                VTX_OUT((float)dstX,                                       (float)dstY,
                      (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
                VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
                      (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
                VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
                      (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
                VTX_OUT((float)(dstX + dstw),                              (float)dstY,
                      (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
            } else {
                /*
                 * Render a big, scissored triangle. This means
                 * increasing the triangle size and adjusting
                 * texture coordinates.
                 */
                VTX_OUT((float)dstX,                              (float)dstY,
                      (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
                VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
                      (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
                      
                VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
                      ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
                                                                (float)srcY / info->accel_state->texH[0]);
            }
          } else if (isplanar) {
            /*
             * Just render a rect (using three coords).
             * Filter is a bit of a misnomer; it's just texcoords...
             */
            VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
                  (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
                  (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
            VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
            VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
          } else {
            /*
             * Just render a rect (using three coords).
             */
            VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
                  (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
            VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
            VTX_OUT((float)(dstX + dstw),                              (float)dstY,
                  (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
          }
      }

      if (IS_R300_3D || IS_R500_3D)
          /* flushing is pipelined, free/finish is not */
          OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);

#ifdef ACCEL_CP
      ADVANCE_RING();
#else
      FINISH_ACCEL();
#endif /* !ACCEL_CP */

      pBox++;
    }

    if (IS_R300_3D || IS_R500_3D) {
      BEGIN_ACCEL(3);
      OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
      OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
    } else
      BEGIN_ACCEL(1);
    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
    FINISH_ACCEL();

    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
}

/*
 * Drop the helper macros used by the code above.  FUNC_NAME is selected at
 * the top of this file according to ACCEL_MMIO / ACCEL_CP, and the vertex
 * emission macros are likewise defined per acceleration type.
 * NOTE(review): presumably this file is included once per acceleration type,
 * so the macros must be undefined before the next inclusion redefines them --
 * confirm against the including source file.
 */
#undef VTX_OUT
#undef VTX_OUT_FILTER
#undef FUNC_NAME

Generated by  Doxygen 1.6.0   Back to index