Diffstat (limited to 'src/gallium')
-rw-r--r--  src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c | 280
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.c              | 623
-rw-r--r--  src/gallium/auxiliary/tgsi/tgsi_exec.h              |   3
-rw-r--r--  src/gallium/auxiliary/util/u_math.h                 |   3
-rw-r--r--  src/gallium/drivers/identity/id_objects.c           |  39
-rw-r--r--  src/gallium/drivers/identity/id_objects.h           |  25
-rw-r--r--  src/gallium/drivers/identity/id_public.h            |   2
-rw-r--r--  src/gallium/drivers/identity/id_screen.c            |  33
-rw-r--r--  src/gallium/drivers/r300/r300_context.c             |  33
-rw-r--r--  src/gallium/drivers/r300/r300_context.h             |   9
-rw-r--r--  src/gallium/drivers/r300/r300_emit.c                |  91
-rw-r--r--  src/gallium/drivers/r300/r300_emit.h                |   2
-rw-r--r--  src/gallium/drivers/r300/r300_reg.h                 |   7
-rw-r--r--  src/gallium/drivers/r300/r300_render.c              |   4
-rw-r--r--  src/gallium/drivers/r300/r300_screen.h              |   2
-rw-r--r--  src/gallium/drivers/r300/r300_state.c               |  23
-rw-r--r--  src/gallium/drivers/r300/r300_state_derived.c       | 339
-rw-r--r--  src/gallium/drivers/r300/r300_state_invariant.c     |   4
-rw-r--r--  src/gallium/drivers/r300/r300_texture.c             |   3
-rw-r--r--  src/gallium/drivers/r300/r300_vbo.c                 |  46
-rw-r--r--  src/gallium/drivers/r300/r300_vs.h                  |   3
-rw-r--r--  src/gallium/drivers/softpipe/sp_context.c           |   7
-rw-r--r--  src/gallium/drivers/softpipe/sp_fs_exec.c           |   8
-rw-r--r--  src/gallium/state_trackers/xorg/xorg_exa_tgsi.c     |  32
-rw-r--r--  src/gallium/state_trackers/xorg/xorg_xv.c           |   5
25 files changed, 964 insertions, 662 deletions
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 2ef4293d4d..2f973684f6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -1,6 +1,6 @@
/**************************************************************************
*
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007-2009 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -80,11 +80,27 @@ struct fenced_buffer_list
*/
struct fenced_buffer
{
+ /*
+ * Immutable members.
+ */
+
struct pb_buffer base;
-
struct pb_buffer *buffer;
+ struct fenced_buffer_list *list;
+
+ /**
+ * Protected by fenced_buffer_list::mutex
+ */
+ struct list_head head;
- /* FIXME: protect access with mutex */
+ /**
+ * Following members are mutable and protected by this mutex.
+ *
+ * You may lock this mutex alone, or lock it with fenced_buffer_list::mutex
+ * held, but in order to prevent deadlocks you must never lock
+ * fenced_buffer_list::mutex with this mutex held.
+ */
+ pipe_mutex mutex;
/**
* A bitmask of PIPE_BUFFER_USAGE_CPU/GPU_READ/WRITE describing the current
@@ -96,9 +112,6 @@ struct fenced_buffer
struct pb_validate *vl;
unsigned validation_flags;
struct pipe_fence_handle *fence;
-
- struct list_head head;
- struct fenced_buffer_list *list;
};
@@ -110,15 +123,24 @@ fenced_buffer(struct pb_buffer *buf)
}
+/**
+ * Add the buffer to the fenced list.
+ *
+ * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this
+ * order before calling this function.
+ *
+ * Reference count should be incremented before calling this function.
+ */
static INLINE void
-_fenced_buffer_add(struct fenced_buffer *fenced_buf)
+fenced_buffer_add_locked(struct fenced_buffer_list *fenced_list,
+ struct fenced_buffer *fenced_buf)
{
- struct fenced_buffer_list *fenced_list = fenced_buf->list;
-
assert(pipe_is_referenced(&fenced_buf->base.base.reference));
assert(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
assert(fenced_buf->fence);
+ /* TODO: Move the reference count increment here */
+
#ifdef DEBUG
LIST_DEL(&fenced_buf->head);
assert(fenced_list->numUnfenced);
@@ -130,32 +152,16 @@ _fenced_buffer_add(struct fenced_buffer *fenced_buf)
/**
- * Actually destroy the buffer.
+ * Remove the buffer from the fenced list.
+ *
+ * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this
+ * order before calling this function.
+ *
+ * Reference count should be decremented after calling this function.
*/
static INLINE void
-_fenced_buffer_destroy(struct fenced_buffer *fenced_buf)
-{
- struct fenced_buffer_list *fenced_list = fenced_buf->list;
-
- assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
- assert(!fenced_buf->fence);
-#ifdef DEBUG
- assert(fenced_buf->head.prev);
- assert(fenced_buf->head.next);
- LIST_DEL(&fenced_buf->head);
- assert(fenced_list->numUnfenced);
- --fenced_list->numUnfenced;
-#else
- (void)fenced_list;
-#endif
- pb_reference(&fenced_buf->buffer, NULL);
- FREE(fenced_buf);
-}
-
-
-static INLINE void
-_fenced_buffer_remove(struct fenced_buffer_list *fenced_list,
- struct fenced_buffer *fenced_buf)
+fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
+ struct fenced_buffer *fenced_buf)
{
struct pb_fence_ops *ops = fenced_list->ops;
@@ -177,37 +183,53 @@ _fenced_buffer_remove(struct fenced_buffer_list *fenced_list,
++fenced_list->numUnfenced;
#endif
- /**
- * FIXME!!!
- */
-
- if(!pipe_is_referenced(&fenced_buf->base.base.reference))
- _fenced_buffer_destroy(fenced_buf);
+ /* TODO: Move the reference count decrement and destruction here */
}
+/**
+ * Wait for the fence to expire, and remove it from the fenced list.
+ *
+ * fenced_buffer::mutex must be held. fenced_buffer_list::mutex must not be
+ * held -- it will be acquired and released inside this function.
+ */
static INLINE enum pipe_error
-_fenced_buffer_finish(struct fenced_buffer *fenced_buf)
+fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
+ struct fenced_buffer *fenced_buf)
{
- struct fenced_buffer_list *fenced_list = fenced_buf->list;
struct pb_fence_ops *ops = fenced_list->ops;
+ enum pipe_error ret = PIPE_ERROR;
#if 0
debug_warning("waiting for GPU");
#endif
+ assert(pipe_is_referenced(&fenced_buf->base.base.reference));
assert(fenced_buf->fence);
+
+ /* Acquire the global lock */
+ pipe_mutex_unlock(fenced_buf->mutex);
+ pipe_mutex_lock(fenced_list->mutex);
+ pipe_mutex_lock(fenced_buf->mutex);
+
if(fenced_buf->fence) {
- if(ops->fence_finish(ops, fenced_buf->fence, 0) != 0) {
- return PIPE_ERROR;
+ if(ops->fence_finish(ops, fenced_buf->fence, 0) == 0) {
+ /* Remove from the fenced list */
+ /* TODO: remove consequents */
+ fenced_buffer_remove_locked(fenced_list, fenced_buf);
+
+ p_atomic_dec(&fenced_buf->base.base.reference.count);
+ assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+
+ fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+
+ ret = PIPE_OK;
}
- /* Remove from the fenced list */
- /* TODO: remove consequents */
- _fenced_buffer_remove(fenced_list, fenced_buf);
}
- fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
- return PIPE_OK;
+ pipe_mutex_unlock(fenced_list->mutex);
+
+ return ret;
}
@@ -215,7 +237,7 @@ _fenced_buffer_finish(struct fenced_buffer *fenced_buf)
* Free as many fenced buffers from the list head as possible.
*/
static void
-_fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
+fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list,
int wait)
{
struct pb_fence_ops *ops = fenced_list->ops;
@@ -228,21 +250,28 @@ _fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
while(curr != &fenced_list->delayed) {
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+ pipe_mutex_lock(fenced_buf->mutex);
+
if(fenced_buf->fence != prev_fence) {
int signaled;
if (wait)
signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
else
signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
- if (signaled != 0)
+ if (signaled != 0) {
+ pipe_mutex_unlock(fenced_buf->mutex);
break;
+ }
prev_fence = fenced_buf->fence;
}
else {
assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
}
- _fenced_buffer_remove(fenced_list, fenced_buf);
+ fenced_buffer_remove_locked(fenced_list, fenced_buf);
+ pipe_mutex_unlock(fenced_buf->mutex);
+
+ pb_reference((struct pb_buffer **)&fenced_buf, NULL);
curr = next;
next = curr->next;
@@ -256,30 +285,25 @@ fenced_buffer_destroy(struct pb_buffer *buf)
struct fenced_buffer *fenced_buf = fenced_buffer(buf);
struct fenced_buffer_list *fenced_list = fenced_buf->list;
- pipe_mutex_lock(fenced_list->mutex);
assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
- if (fenced_buf->fence) {
- struct pb_fence_ops *ops = fenced_list->ops;
- if(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
- struct list_head *curr, *prev;
- curr = &fenced_buf->head;
- prev = curr->prev;
- do {
- fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
- assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
- _fenced_buffer_remove(fenced_list, fenced_buf);
- curr = prev;
- prev = curr->prev;
- } while (curr != &fenced_list->delayed);
- }
- else {
- /* delay destruction */
- }
- }
- else {
- _fenced_buffer_destroy(fenced_buf);
- }
+ assert(!fenced_buf->fence);
+
+#ifdef DEBUG
+ pipe_mutex_lock(fenced_list->mutex);
+ assert(fenced_buf->head.prev);
+ assert(fenced_buf->head.next);
+ LIST_DEL(&fenced_buf->head);
+ assert(fenced_list->numUnfenced);
+ --fenced_list->numUnfenced;
pipe_mutex_unlock(fenced_list->mutex);
+#else
+ (void)fenced_list;
+#endif
+
+ pb_reference(&fenced_buf->buffer, NULL);
+
+ pipe_mutex_destroy(fenced_buf->mutex);
+ FREE(fenced_buf);
}
@@ -290,24 +314,23 @@ fenced_buffer_map(struct pb_buffer *buf,
struct fenced_buffer *fenced_buf = fenced_buffer(buf);
struct fenced_buffer_list *fenced_list = fenced_buf->list;
struct pb_fence_ops *ops = fenced_list->ops;
- void *map;
+ void *map = NULL;
+
+ pipe_mutex_lock(fenced_buf->mutex);
assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE));
/* Serialize writes */
if((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) ||
((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ) && (flags & PIPE_BUFFER_USAGE_CPU_WRITE))) {
- if(flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
+ if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
+ ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
/* Don't wait for the GPU to finish writing */
- if(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0)
- _fenced_buffer_remove(fenced_list, fenced_buf);
- else
- return NULL;
- }
- else {
- /* Wait for the GPU to finish writing */
- _fenced_buffer_finish(fenced_buf);
+ goto finish;
}
+
+ /* Wait for the GPU to finish writing */
+ fenced_buffer_finish_locked(fenced_list, fenced_buf);
}
#if 0
@@ -324,6 +347,9 @@ fenced_buffer_map(struct pb_buffer *buf,
fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
}
+finish:
+ pipe_mutex_unlock(fenced_buf->mutex);
+
return map;
}
@@ -332,6 +358,9 @@ static void
fenced_buffer_unmap(struct pb_buffer *buf)
{
struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+
+ pipe_mutex_lock(fenced_buf->mutex);
+
assert(fenced_buf->mapcount);
if(fenced_buf->mapcount) {
pb_unmap(fenced_buf->buffer);
@@ -339,6 +368,8 @@ fenced_buffer_unmap(struct pb_buffer *buf)
if(!fenced_buf->mapcount)
fenced_buf->flags &= ~PIPE_BUFFER_USAGE_CPU_READ_WRITE;
}
+
+ pipe_mutex_unlock(fenced_buf->mutex);
}
@@ -350,11 +381,14 @@ fenced_buffer_validate(struct pb_buffer *buf,
struct fenced_buffer *fenced_buf = fenced_buffer(buf);
enum pipe_error ret;
+ pipe_mutex_lock(fenced_buf->mutex);
+
if(!vl) {
/* invalidate */
fenced_buf->vl = NULL;
fenced_buf->validation_flags = 0;
- return PIPE_OK;
+ ret = PIPE_OK;
+ goto finish;
}
assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
@@ -362,14 +396,17 @@ fenced_buffer_validate(struct pb_buffer *buf,
flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE;
/* Buffer cannot be validated in two different lists */
- if(fenced_buf->vl && fenced_buf->vl != vl)
- return PIPE_ERROR_RETRY;
+ if(fenced_buf->vl && fenced_buf->vl != vl) {
+ ret = PIPE_ERROR_RETRY;
+ goto finish;
+ }
#if 0
/* Do not validate if buffer is still mapped */
if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
/* TODO: wait for the thread that mapped the buffer to unmap it */
- return PIPE_ERROR_RETRY;
+ ret = PIPE_ERROR_RETRY;
+ goto finish;
}
/* Final sanity checking */
assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
@@ -379,17 +416,21 @@ fenced_buffer_validate(struct pb_buffer *buf,
if(fenced_buf->vl == vl &&
(fenced_buf->validation_flags & flags) == flags) {
/* Nothing to do -- buffer already validated */
- return PIPE_OK;
+ ret = PIPE_OK;
+ goto finish;
}
ret = pb_validate(fenced_buf->buffer, vl, flags);
if (ret != PIPE_OK)
- return ret;
+ goto finish;
fenced_buf->vl = vl;
fenced_buf->validation_flags |= flags;
- return PIPE_OK;
+finish:
+ pipe_mutex_unlock(fenced_buf->mutex);
+
+ return ret;
}
@@ -404,29 +445,36 @@ fenced_buffer_fence(struct pb_buffer *buf,
fenced_buf = fenced_buffer(buf);
fenced_list = fenced_buf->list;
ops = fenced_list->ops;
-
- if(fence == fenced_buf->fence) {
- /* Nothing to do */
- return;
- }
- assert(fenced_buf->vl);
- assert(fenced_buf->validation_flags);
-
pipe_mutex_lock(fenced_list->mutex);
- if (fenced_buf->fence)
- _fenced_buffer_remove(fenced_list, fenced_buf);
- if (fence) {
- ops->fence_reference(ops, &fenced_buf->fence, fence);
- fenced_buf->flags |= fenced_buf->validation_flags;
- _fenced_buffer_add(fenced_buf);
- }
- pipe_mutex_unlock(fenced_list->mutex);
+ pipe_mutex_lock(fenced_buf->mutex);
+
+ assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+
+ if(fence != fenced_buf->fence) {
+ assert(fenced_buf->vl);
+ assert(fenced_buf->validation_flags);
+
+ if (fenced_buf->fence) {
+ fenced_buffer_remove_locked(fenced_list, fenced_buf);
+ p_atomic_dec(&fenced_buf->base.base.reference.count);
+ assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+ }
+ if (fence) {
+ ops->fence_reference(ops, &fenced_buf->fence, fence);
+ fenced_buf->flags |= fenced_buf->validation_flags;
+ p_atomic_inc(&fenced_buf->base.base.reference.count);
+ fenced_buffer_add_locked(fenced_list, fenced_buf);
+ }
+
+ pb_fence(fenced_buf->buffer, fence);
- pb_fence(fenced_buf->buffer, fence);
+ fenced_buf->vl = NULL;
+ fenced_buf->validation_flags = 0;
+ }
- fenced_buf->vl = NULL;
- fenced_buf->validation_flags = 0;
+ pipe_mutex_unlock(fenced_buf->mutex);
+ pipe_mutex_unlock(fenced_list->mutex);
}
@@ -436,6 +484,7 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
pb_size *offset)
{
struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+ /* NOTE: accesses immutable members only -- mutex not necessary */
pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
}
@@ -475,6 +524,8 @@ fenced_buffer_create(struct fenced_buffer_list *fenced_list,
buf->buffer = buffer;
buf->list = fenced_list;
+ pipe_mutex_init(buf->mutex);
+
#ifdef DEBUG
pipe_mutex_lock(fenced_list->mutex);
LIST_ADDTAIL(&buf->head, &fenced_list->unfenced);
@@ -516,7 +567,7 @@ fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
int wait)
{
pipe_mutex_lock(fenced_list->mutex);
- _fenced_buffer_list_check_free(fenced_list, wait);
+ fenced_buffer_list_check_free_locked(fenced_list, wait);
pipe_mutex_unlock(fenced_list->mutex);
}
@@ -538,11 +589,13 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
next = curr->next;
while(curr != &fenced_list->unfenced) {
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+ pipe_mutex_lock(fenced_buf->mutex);
assert(!fenced_buf->fence);
debug_printf("%10p %7u %7u\n",
(void *) fenced_buf,
fenced_buf->base.base.size,
p_atomic_read(&fenced_buf->base.base.reference.count));
+ pipe_mutex_unlock(fenced_buf->mutex);
curr = next;
next = curr->next;
}
@@ -552,6 +605,7 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
while(curr != &fenced_list->delayed) {
int signaled;
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+ pipe_mutex_lock(fenced_buf->mutex);
signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
debug_printf("%10p %7u %7u %10p %s\n",
(void *) fenced_buf,
@@ -559,6 +613,7 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
p_atomic_read(&fenced_buf->base.base.reference.count),
(void *) fenced_buf->fence,
signaled == 0 ? "y" : "n");
+ pipe_mutex_unlock(fenced_buf->mutex);
curr = next;
next = curr->next;
}
@@ -579,8 +634,8 @@ fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
sched_yield();
#endif
- _fenced_buffer_list_check_free(fenced_list, 1);
pipe_mutex_lock(fenced_list->mutex);
+ fenced_buffer_list_check_free_locked(fenced_list, 1);
}
#ifdef DEBUG
@@ -588,6 +643,7 @@ fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
#endif
pipe_mutex_unlock(fenced_list->mutex);
+ pipe_mutex_destroy(fenced_list->mutex);
fenced_list->ops->destroy(fenced_list->ops);
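The pb_buffer_fenced.c changes above introduce a per-buffer mutex next to the existing list mutex, with a fixed lock order: a buffer mutex may be taken alone, or after fenced_buffer_list::mutex, but the list mutex must never be acquired while a buffer mutex is already held. The following is a minimal, self-contained sketch of that discipline using plain pthreads and hypothetical names (it is not the actual pipebuffer code); it mirrors how fenced_buffer_finish_locked() temporarily drops the buffer mutex so it can take both locks in the legal order:

    #include <pthread.h>

    struct buf {
       pthread_mutex_t mutex;   /* protects the buffer's mutable state */
       int fenced;              /* stand-in for "still has an unsignalled fence" */
    };

    struct buf_list {
       pthread_mutex_t mutex;   /* protects membership of the fenced list */
    };

    /* Entered with b->mutex held; must also take l->mutex without deadlocking. */
    static void finish_locked(struct buf_list *l, struct buf *b)
    {
       /* Release the buffer mutex, then reacquire both in the allowed order:
        * list mutex first, buffer mutex second. */
       pthread_mutex_unlock(&b->mutex);
       pthread_mutex_lock(&l->mutex);
       pthread_mutex_lock(&b->mutex);

       if (b->fenced) {
          /* ... wait for the fence and unlink the buffer from the list ... */
          b->fenced = 0;
       }

       /* Leave with only the buffer mutex held, as the caller expects. */
       pthread_mutex_unlock(&l->mutex);
    }

    int main(void)
    {
       struct buf_list l;
       struct buf b;

       pthread_mutex_init(&l.mutex, NULL);
       pthread_mutex_init(&b.mutex, NULL);
       b.fenced = 1;

       pthread_mutex_lock(&b.mutex);       /* caller's precondition */
       finish_locked(&l, &b);
       pthread_mutex_unlock(&b.mutex);
       return 0;
    }

The same ordering rule is why fenced_buffer_fence() above takes fenced_list->mutex before fenced_buf->mutex, and why the destroy path never needs more than one lock at a time.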
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index b7569e74d4..af914f6d08 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -62,9 +62,6 @@
#define FAST_MATH 1
-/** for tgsi_full_instruction::Flags */
-#define SOA_DEPENDENCY_FLAG 0x1
-
#define TILE_TOP_LEFT 0
#define TILE_TOP_RIGHT 1
#define TILE_BOTTOM_LEFT 2
@@ -332,20 +329,6 @@ tgsi_exec_machine_bind_shader(
maxInstructions += 10;
}
- if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
- uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
- parse.FullToken.FullInstruction.Flags = SOA_DEPENDENCY_FLAG;
- /* XXX we only handle SOA dependencies properly for MOV/SWZ
- * at this time!
- */
- if (opcode != TGSI_OPCODE_MOV) {
- debug_printf("Warning: SOA dependency in instruction"
- " is not handled:\n");
- tgsi_dump_instruction(&parse.FullToken.FullInstruction,
- numInstructions);
- }
- }
-
memcpy(instructions + numInstructions,
&parse.FullToken.FullInstruction,
sizeof(instructions[0]));
@@ -595,6 +578,24 @@ micro_exp2(
dst->f[2] = util_fast_exp2( src->f[2] );
dst->f[3] = util_fast_exp2( src->f[3] );
#else
+
+#if DEBUG
+ /* Inf is okay for this instruction, so clamp it to silence assertions. */
+ uint i;
+ union tgsi_exec_channel clamped;
+
+ for (i = 0; i < 4; i++) {
+ if (src->f[i] > 127.99999f) {
+ clamped.f[i] = 127.99999f;
+ } else if (src->f[i] < -126.99999f) {
+ clamped.f[i] = -126.99999f;
+ } else {
+ clamped.f[i] = src->f[i];
+ }
+ }
+ src = &clamped;
+#endif
+
dst->f[0] = powf( 2.0f, src->f[0] );
dst->f[1] = powf( 2.0f, src->f[1] );
dst->f[2] = powf( 2.0f, src->f[2] );
@@ -1478,6 +1479,13 @@ store_dest(
dst = &mach->Addrs[index].xyzw[chan_index];
break;
+ case TGSI_FILE_LOOP:
+ assert(reg->DstRegister.Index == 0);
+ assert(mach->LoopCounterStackTop > 0);
+ assert(chan_index == CHAN_X);
+ dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
+ break;
+
case TGSI_FILE_PREDICATE:
index = reg->DstRegister.Index;
assert(index < TGSI_EXEC_NUM_PREDS);
@@ -1715,6 +1723,64 @@ exec_tex(struct tgsi_exec_machine *mach,
}
}
+static void
+exec_txd(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ const uint unit = inst->FullSrcRegisters[3].SrcRegister.Index;
+ union tgsi_exec_channel r[4];
+ uint chan_index;
+
+ /*
+ * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
+ */
+
+ switch (inst->InstructionExtTexture.Texture) {
+ case TGSI_TEXTURE_1D:
+ case TGSI_TEXTURE_SHADOW1D:
+
+ FETCH(&r[0], 0, CHAN_X);
+
+ fetch_texel(mach->Samplers[unit],
+ &r[0], &ZeroVec, &ZeroVec, 0.0f, /* S, T, P, BIAS */
+ &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+ break;
+
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+
+ FETCH(&r[0], 0, CHAN_X);
+ FETCH(&r[1], 0, CHAN_Y);
+ FETCH(&r[2], 0, CHAN_Z);
+
+ fetch_texel(mach->Samplers[unit],
+ &r[0], &r[1], &r[2], 0.0f, /* inputs */
+ &r[0], &r[1], &r[2], &r[3]); /* outputs */
+ break;
+
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+
+ FETCH(&r[0], 0, CHAN_X);
+ FETCH(&r[1], 0, CHAN_Y);
+ FETCH(&r[2], 0, CHAN_Z);
+
+ fetch_texel(mach->Samplers[unit],
+ &r[0], &r[1], &r[2], 0.0f,
+ &r[0], &r[1], &r[2], &r[3]);
+ break;
+
+ default:
+ assert(0);
+ }
+
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&r[chan_index], 0, chan_index);
+ }
+}
+
/**
* Evaluate a constant-valued coefficient at the position of the
@@ -1784,53 +1850,58 @@ typedef void (* eval_coef_func)(
unsigned chan );
static void
-exec_declaration(
- struct tgsi_exec_machine *mach,
- const struct tgsi_full_declaration *decl )
+exec_declaration(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_declaration *decl)
{
- if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
- if( decl->Declaration.File == TGSI_FILE_INPUT ) {
- unsigned first, last, mask;
- eval_coef_func eval;
+ if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+ if (decl->Declaration.File == TGSI_FILE_INPUT) {
+ uint first, last, mask;
first = decl->DeclarationRange.First;
last = decl->DeclarationRange.Last;
mask = decl->Declaration.UsageMask;
- switch( decl->Declaration.Interpolate ) {
- case TGSI_INTERPOLATE_CONSTANT:
- eval = eval_constant_coef;
- break;
-
- case TGSI_INTERPOLATE_LINEAR:
- eval = eval_linear_coef;
- break;
-
- case TGSI_INTERPOLATE_PERSPECTIVE:
- eval = eval_perspective_coef;
- break;
+ if (decl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+ assert(decl->Semantic.SemanticIndex == 0);
+ assert(first == last);
+ assert(mask == TGSI_WRITEMASK_XYZW);
- default:
- assert( 0 );
- return;
- }
+ mach->Inputs[first] = mach->QuadPos;
+ } else if (decl->Semantic.SemanticName == TGSI_SEMANTIC_FACE) {
+ uint i;
- if( mask == TGSI_WRITEMASK_XYZW ) {
- unsigned i, j;
+ assert(decl->Semantic.SemanticIndex == 0);
+ assert(first == last);
- for( i = first; i <= last; i++ ) {
- for( j = 0; j < NUM_CHANNELS; j++ ) {
- eval( mach, i, j );
- }
+ for (i = 0; i < QUAD_SIZE; i++) {
+ mach->Inputs[first].xyzw[0].f[i] = mach->Face;
+ }
+ } else {
+ eval_coef_func eval;
+ uint i, j;
+
+ switch (decl->Declaration.Interpolate) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ eval = eval_constant_coef;
+ break;
+
+ case TGSI_INTERPOLATE_LINEAR:
+ eval = eval_linear_coef;
+ break;
+
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ eval = eval_perspective_coef;
+ break;
+
+ default:
+ assert(0);
+ return;
}
- }
- else {
- unsigned i, j;
- for( j = 0; j < NUM_CHANNELS; j++ ) {
- if( mask & (1 << j) ) {
- for( i = first; i <= last; i++ ) {
- eval( mach, i, j );
+ for (j = 0; j < NUM_CHANNELS; j++) {
+ if (mask & (1 << j)) {
+ for (i = first; i <= last; i++) {
+ eval(mach, i, j);
}
}
}
@@ -1847,6 +1918,7 @@ exec_instruction(
{
uint chan_index;
union tgsi_exec_channel r[10];
+ union tgsi_exec_channel d[8];
(*pc)++;
@@ -1855,42 +1927,27 @@ exec_instruction(
case TGSI_OPCODE_FLR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_flr( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_flr(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_MOV:
- if (inst->Flags & SOA_DEPENDENCY_FLAG) {
- /* Do all fetches into temp regs, then do all stores to avoid
- * intermediate/accidental clobbering. This could be done all the
- * time for MOV but for other instructions we'll need more temps...
- */
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
- FETCH( &r[chan_index], 0, chan_index );
- }
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( &r[chan_index], 0, chan_index );
- }
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ FETCH(&d[chan_index], 0, chan_index);
}
- else {
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
- FETCH( &r[0], 0, chan_index );
- STORE( &r[0], 0, chan_index );
- }
+ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_LIT:
- if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
- STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
- }
-
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
FETCH( &r[0], 0, CHAN_X );
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, CHAN_Y );
+ micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
@@ -1901,11 +1958,19 @@ exec_instruction(
micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
micro_pow( &r[1], &r[1], &r[2] );
- micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, CHAN_Z );
+ micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
}
- }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ STORE(&d[CHAN_Y], 0, CHAN_Y);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ STORE(&d[CHAN_Z], 0, CHAN_Z);
+ }
+ }
+ if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+ STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+ }
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
@@ -1973,14 +2038,13 @@ exec_instruction(
break;
case TGSI_OPCODE_MUL:
- FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
- {
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
-
- micro_mul( &r[0], &r[0], &r[1] );
-
- STORE(&r[0], 0, chan_index);
+ micro_mul(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -1988,8 +2052,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_add( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_add(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2045,25 +2111,29 @@ exec_instruction(
break;
case TGSI_OPCODE_DST:
- if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
- STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
- }
-
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
FETCH( &r[0], 0, CHAN_Y );
FETCH( &r[1], 1, CHAN_Y);
- micro_mul( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, CHAN_Y );
+ micro_mul(&d[CHAN_Y], &r[0], &r[1]);
}
-
if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
- FETCH( &r[0], 0, CHAN_Z );
- STORE( &r[0], 0, CHAN_Z );
+ FETCH(&d[CHAN_Z], 0, CHAN_Z);
}
-
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
- FETCH( &r[0], 1, CHAN_W );
- STORE( &r[0], 0, CHAN_W );
+ FETCH(&d[CHAN_W], 1, CHAN_W);
+ }
+
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ STORE(&d[CHAN_Y], 0, CHAN_Y);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ STORE(&d[CHAN_Z], 0, CHAN_Z);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ STORE(&d[CHAN_W], 0, CHAN_W);
}
break;
@@ -2073,9 +2143,10 @@ exec_instruction(
FETCH(&r[1], 1, chan_index);
/* XXX use micro_min()?? */
- micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
-
- STORE(&r[0], 0, chan_index);
+ micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2085,9 +2156,10 @@ exec_instruction(
FETCH(&r[1], 1, chan_index);
/* XXX use micro_max()?? */
- micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
-
- STORE(&r[0], 0, chan_index );
+ micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2096,8 +2168,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, chan_index );
+ micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2106,8 +2180,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, chan_index );
+ micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2118,8 +2194,10 @@ exec_instruction(
FETCH( &r[1], 1, chan_index );
micro_mul( &r[0], &r[0], &r[1] );
FETCH( &r[1], 2, chan_index );
- micro_add( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_add(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2127,10 +2205,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
-
- micro_sub( &r[0], &r[0], &r[1] );
-
- STORE(&r[0], 0, chan_index);
+ micro_sub(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2139,12 +2217,12 @@ exec_instruction(
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
FETCH(&r[2], 2, chan_index);
-
micro_sub( &r[1], &r[1], &r[2] );
micro_mul( &r[0], &r[0], &r[1] );
- micro_add( &r[0], &r[0], &r[2] );
-
- STORE(&r[0], 0, chan_index);
+ micro_add(&d[chan_index], &r[0], &r[2]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2153,8 +2231,10 @@ exec_instruction(
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
FETCH(&r[2], 2, chan_index);
- micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
- STORE(&r[0], 0, chan_index);
+ micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2179,8 +2259,10 @@ exec_instruction(
case TGSI_OPCODE_FRC:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_frc( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_frc(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2190,8 +2272,10 @@ exec_instruction(
FETCH(&r[1], 1, chan_index);
micro_max(&r[0], &r[0], &r[1]);
FETCH(&r[1], 2, chan_index);
- micro_min(&r[0], &r[0], &r[1]);
- STORE(&r[0], 0, chan_index);
+ micro_min(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2199,19 +2283,17 @@ exec_instruction(
case TGSI_OPCODE_ARR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_rnd( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_rnd(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_EX2:
FETCH(&r[0], 0, CHAN_X);
-#if FAST_MATH
micro_exp2( &r[0], &r[0] );
-#else
- micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
-#endif
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
@@ -2247,11 +2329,7 @@ exec_instruction(
FETCH(&r[4], 1, CHAN_Y);
micro_mul( &r[5], &r[3], &r[4] );
- micro_sub( &r[2], &r[2], &r[5] );
-
- if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
- STORE( &r[2], 0, CHAN_X );
- }
+ micro_sub(&d[CHAN_X], &r[2], &r[5]);
FETCH(&r[2], 1, CHAN_X);
@@ -2260,20 +2338,21 @@ exec_instruction(
FETCH(&r[5], 0, CHAN_X);
micro_mul( &r[1], &r[1], &r[5] );
- micro_sub( &r[3], &r[3], &r[1] );
-
- if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
- STORE( &r[3], 0, CHAN_Y );
- }
+ micro_sub(&d[CHAN_Y], &r[3], &r[1]);
micro_mul( &r[5], &r[5], &r[4] );
micro_mul( &r[0], &r[0], &r[2] );
- micro_sub( &r[5], &r[5], &r[0] );
+ micro_sub(&d[CHAN_Z], &r[5], &r[0]);
- if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
- STORE( &r[5], 0, CHAN_Z );
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ STORE(&d[CHAN_X], 0, CHAN_X);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ STORE(&d[CHAN_Y], 0, CHAN_Y);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ STORE(&d[CHAN_Z], 0, CHAN_Z);
}
-
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
@@ -2282,11 +2361,11 @@ exec_instruction(
case TGSI_OPCODE_ABS:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
-
- micro_abs( &r[0], &r[0] );
-
- STORE(&r[0], 0, chan_index);
+ micro_abs(&d[chan_index], &r[0]);
}
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
+ }
break;
case TGSI_OPCODE_RCC:
@@ -2338,16 +2417,20 @@ exec_instruction(
case TGSI_OPCODE_DDX:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_ddx( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_ddx(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_DDY:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_ddy( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_ddy(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2428,10 +2511,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_eq( &r[0], &r[0], &r[1],
- &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
- &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, chan_index );
+ micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2445,8 +2528,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
- STORE( &r[0], 0, chan_index );
+ micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2462,8 +2547,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
- STORE( &r[0], 0, chan_index );
+ micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2471,8 +2558,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
- STORE( &r[0], 0, chan_index );
+ micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2502,7 +2591,7 @@ exec_instruction(
/* src[1] = d[strq]/dx */
/* src[2] = d[strq]/dy */
/* src[3] = sampler unit */
- assert (0);
+ exec_txd(mach, inst);
break;
case TGSI_OPCODE_TXL:
@@ -2546,13 +2635,8 @@ exec_instruction(
micro_mul(&r[3], &r[3], &r[1]);
micro_add(&r[2], &r[2], &r[3]);
FETCH(&r[3], 0, CHAN_X);
- micro_add(&r[2], &r[2], &r[3]);
- if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
- STORE(&r[2], 0, CHAN_X);
- }
- if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
- STORE(&r[2], 0, CHAN_Z);
- }
+ micro_add(&d[CHAN_X], &r[2], &r[3]);
+
}
if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
@@ -2562,13 +2646,20 @@ exec_instruction(
micro_mul(&r[3], &r[3], &r[1]);
micro_add(&r[2], &r[2], &r[3]);
FETCH(&r[3], 0, CHAN_Y);
- micro_add(&r[2], &r[2], &r[3]);
- if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
- STORE(&r[2], 0, CHAN_Y);
- }
- if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
- STORE(&r[2], 0, CHAN_W);
- }
+ micro_add(&d[CHAN_Y], &r[2], &r[3]);
+
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
+ STORE(&d[CHAN_X], 0, CHAN_X);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+ STORE(&d[CHAN_Y], 0, CHAN_Y);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+ STORE(&d[CHAN_X], 0, CHAN_Z);
+ }
+ if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
+ STORE(&d[CHAN_Y], 0, CHAN_W);
}
break;
@@ -2653,8 +2744,10 @@ exec_instruction(
/* TGSI_OPCODE_SGN */
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_sgn( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_sgn(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2663,10 +2756,10 @@ exec_instruction(
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
FETCH(&r[2], 2, chan_index);
-
- micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
-
- STORE(&r[0], 0, chan_index);
+ micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2841,32 +2934,40 @@ exec_instruction(
case TGSI_OPCODE_CEIL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_ceil( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_ceil(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_I2F:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_i2f( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_i2f(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_NOT:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_not( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_not(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
case TGSI_OPCODE_TRUNC:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
- micro_trunc( &r[0], &r[0] );
- STORE( &r[0], 0, chan_index );
+ micro_trunc(&d[chan_index], &r[0]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2874,8 +2975,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_shl( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_shl(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2883,8 +2986,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_ishr( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_ishr(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2892,8 +2997,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_and( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_and(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2901,8 +3008,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_or( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_or(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2914,8 +3023,10 @@ exec_instruction(
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
- micro_xor( &r[0], &r[0], &r[1] );
- STORE( &r[0], 0, chan_index );
+ micro_xor(&d[chan_index], &r[0], &r[1]);
+ }
+ FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+ STORE(&d[chan_index], 0, chan_index);
}
break;
@@ -2946,8 +3057,23 @@ exec_instruction(
for (chan_index = 0; chan_index < 3; chan_index++) {
FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
}
- STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
++mach->LoopCounterStackTop;
+ STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
+ /* update LoopMask */
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
+ mach->LoopMask &= ~0x1;
+ }
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
+ mach->LoopMask &= ~0x2;
+ }
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
+ mach->LoopMask &= ~0x4;
+ }
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
+ mach->LoopMask &= ~0x8;
+ }
+ /* TODO: if mach->LoopMask == 0, jump to end of loop */
+ UPDATE_EXEC_MASK(mach);
/* fall-through (for now) */
case TGSI_OPCODE_BGNLOOP:
/* push LoopMask and ContMasks */
@@ -2961,28 +3087,28 @@ exec_instruction(
case TGSI_OPCODE_ENDFOR:
assert(mach->LoopCounterStackTop > 0);
- micro_sub( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
- &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
- &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+ micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
+ &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
+ &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
/* update LoopMask */
- if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[0] <= 0) {
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
mach->LoopMask &= ~0x1;
}
- if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[1] <= 0 ) {
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
mach->LoopMask &= ~0x2;
}
- if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[2] <= 0 ) {
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
mach->LoopMask &= ~0x4;
}
- if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[3] <= 0 ) {
+ if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
mach->LoopMask &= ~0x8;
}
- micro_add( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
- &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
- &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
+ micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
+ &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
+ &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
assert(mach->LoopLabelStackTop > 0);
inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
- STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
+ STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
/* Restore ContMask, but don't pop */
assert(mach->ContStackTop > 0);
mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
@@ -3060,6 +3186,8 @@ exec_instruction(
}
}
+#define DEBUG_EXECUTION 0
+
/**
* Run TGSI interpreter.
@@ -3103,10 +3231,67 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
exec_declaration( mach, mach->Declarations+i );
}
- /* execute instructions, until pc is set to -1 */
- while (pc != -1) {
- assert(pc < (int) mach->NumInstructions);
- exec_instruction( mach, mach->Instructions + pc, &pc );
+ {
+#if DEBUG_EXECUTION
+ struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
+ struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
+ uint inst = 1;
+
+ memcpy(temps, mach->Temps, sizeof(temps));
+ memcpy(outputs, mach->Outputs, sizeof(outputs));
+#endif
+
+ /* execute instructions, until pc is set to -1 */
+ while (pc != -1) {
+
+#if DEBUG_EXECUTION
+ uint i;
+
+ tgsi_dump_instruction(&mach->Instructions[pc], inst++);
+#endif
+
+ assert(pc < (int) mach->NumInstructions);
+ exec_instruction(mach, mach->Instructions + pc, &pc);
+
+#if DEBUG_EXECUTION
+ for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
+ if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
+ uint j;
+
+ memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
+ debug_printf("TEMP[%2u] = ", i);
+ for (j = 0; j < 4; j++) {
+ if (j > 0) {
+ debug_printf(" ");
+ }
+ debug_printf("(%6f, %6f, %6f, %6f)\n",
+ temps[i].xyzw[0].f[j],
+ temps[i].xyzw[1].f[j],
+ temps[i].xyzw[2].f[j],
+ temps[i].xyzw[3].f[j]);
+ }
+ }
+ }
+ for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+ if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
+ uint j;
+
+ memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
+ debug_printf("OUT[%2u] = ", i);
+ for (j = 0; j < 4; j++) {
+ if (j > 0) {
+ debug_printf(" ");
+ }
+ debug_printf("{%6f, %6f, %6f, %6f}\n",
+ outputs[i].xyzw[0].f[j],
+ outputs[i].xyzw[1].f[j],
+ outputs[i].xyzw[2].f[j],
+ outputs[i].xyzw[3].f[j]);
+ }
+ }
+ }
+#endif
+ }
}
#if 0
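The tgsi_exec.c changes replace the old SOA_DEPENDENCY_FLAG special case with a general pattern: each per-channel opcode first fetches every enabled channel into a local destination array d[] and only stores afterwards, so an instruction whose destination register is also a source cannot clobber channels it still needs to read. A tiny standalone sketch of the idea (a made-up two-channel register, not interpreter code):

    #include <stdio.h>

    int main(void)
    {
       /* Model "MOV r0.xy, r0.yx": the destination is also the source. */
       float r0[2] = { 1.0f, 2.0f };
       float d[2];

       /* Fetch every enabled channel first... */
       d[0] = r0[1];     /* dst.x <- src.y */
       d[1] = r0[0];     /* dst.y <- src.x */

       /* ...then store, so the swap comes out as (2, 1) rather than the
        * (2, 2) a fetch-store-per-channel loop would produce. */
       r0[0] = d[0];
       r0[1] = d[1];

       printf("%g %g\n", r0[0], r0[1]);
       return 0;
    }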
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 471f591dd6..3dff69a505 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -232,6 +232,7 @@ struct tgsi_exec_machine
/* FRAGMENT processor only. */
const struct tgsi_interp_coef *InterpCoefs;
struct tgsi_exec_vector QuadPos;
+ float Face; /**< +1 if front facing, -1 if back facing */
/* Conditional execution masks */
uint CondMask; /**< For IF/ELSE/ENDIF */
@@ -252,7 +253,7 @@ struct tgsi_exec_machine
uint LoopLabelStack[TGSI_EXEC_MAX_LOOP_NESTING];
int LoopLabelStackTop;
- /** Loop counter stack (x = count, y = current, z = step) */
+ /** Loop counter stack (x = index, y = counter, z = step) */
struct tgsi_exec_vector LoopCounterStack[TGSI_EXEC_MAX_LOOP_NESTING];
int LoopCounterStackTop;
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 75b075f160..731a11475e 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -499,6 +499,9 @@ util_next_power_of_two(unsigned x)
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
+#define MIN3( A, B, C ) MIN2( MIN2( A, B ), C )
+#define MAX3( A, B, C ) MAX2( MAX2( A, B ), C )
+
static INLINE int
align(int value, int alignment)
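The new MIN3/MAX3 helpers simply nest MIN2/MAX2; MIN3 is used further down in the r300_emit.c hunk to bound the number of PVS slots. A small sanity-check sketch with made-up values (the MIN2/MAX2 definitions are repeated here only so the snippet compiles on its own):

    #include <assert.h>

    #define MIN2(A, B) ((A) < (B) ? (A) : (B))
    #define MAX2(A, B) ((A) > (B) ? (A) : (B))
    #define MIN3(A, B, C) MIN2(MIN2(A, B), C)
    #define MAX3(A, B, C) MAX2(MAX2(A, B), C)

    int main(void)
    {
       /* e.g. pvs_num_slots = MIN3(vtx_mem_size / inputs,
        *                           vtx_mem_size / outputs, 10) */
       assert(MIN3(72 / 4, 72 / 8, 10) == 9);
       assert(MAX3(1, 7, 3) == 7);
       return 0;
    }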
diff --git a/src/gallium/drivers/identity/id_objects.c b/src/gallium/drivers/identity/id_objects.c
index e893e59940..bc9bc7121d 100644
--- a/src/gallium/drivers/identity/id_objects.c
+++ b/src/gallium/drivers/identity/id_objects.c
@@ -180,3 +180,42 @@ identity_transfer_destroy(struct identity_transfer *id_transfer)
screen->tex_transfer_destroy(id_transfer->transfer);
FREE(id_transfer);
}
+
+struct pipe_video_surface *
+identity_video_surface_create(struct identity_screen *id_screen,
+ struct pipe_video_surface *video_surface)
+{
+ struct identity_video_surface *id_video_surface;
+
+ if (!video_surface) {
+ goto error;
+ }
+
+ assert(video_surface->screen == id_screen->screen);
+
+ id_video_surface = CALLOC_STRUCT(identity_video_surface);
+ if (!id_video_surface) {
+ goto error;
+ }
+
+ memcpy(&id_video_surface->base,
+ video_surface,
+ sizeof(struct pipe_video_surface));
+
+ pipe_reference_init(&id_video_surface->base.reference, 1);
+ id_video_surface->base.screen = &id_screen->base;
+ id_video_surface->video_surface = video_surface;
+
+ return &id_video_surface->base;
+
+error:
+ pipe_video_surface_reference(&video_surface, NULL);
+ return NULL;
+}
+
+void
+identity_video_surface_destroy(struct identity_video_surface *id_video_surface)
+{
+ pipe_video_surface_reference(&id_video_surface->video_surface, NULL);
+ FREE(id_video_surface);
+}
diff --git a/src/gallium/drivers/identity/id_objects.h b/src/gallium/drivers/identity/id_objects.h
index ce58faa3c7..77cc719079 100644
--- a/src/gallium/drivers/identity/id_objects.h
+++ b/src/gallium/drivers/identity/id_objects.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
#include "pipe/p_state.h"
+#include "pipe/p_video_state.h"
#include "id_screen.h"
@@ -67,6 +68,14 @@ struct identity_transfer
};
+struct identity_video_surface
+{
+ struct pipe_video_surface base;
+
+ struct pipe_video_surface *video_surface;
+};
+
+
static INLINE struct identity_buffer *
identity_buffer(struct pipe_buffer *_buffer)
{
@@ -103,6 +112,15 @@ identity_transfer(struct pipe_transfer *_transfer)
return (struct identity_transfer *)_transfer;
}
+static INLINE struct identity_video_surface *
+identity_video_surface(struct pipe_video_surface *_video_surface)
+{
+ if (!_video_surface) {
+ return NULL;
+ }
+ (void)identity_screen(_video_surface->screen);
+ return (struct identity_video_surface *)_video_surface;
+}
static INLINE struct pipe_buffer *
identity_buffer_unwrap(struct pipe_buffer *_buffer)
@@ -165,5 +183,12 @@ identity_transfer_create(struct identity_texture *id_texture,
void
identity_transfer_destroy(struct identity_transfer *id_transfer);
+struct pipe_video_surface *
+identity_video_surface_create(struct identity_screen *id_screen,
+ struct pipe_video_surface *video_surface);
+
+void
+identity_video_surface_destroy(struct identity_video_surface *id_video_surface);
+
#endif /* ID_OBJECTS_H */
diff --git a/src/gallium/drivers/identity/id_public.h b/src/gallium/drivers/identity/id_public.h
index cac14cfd60..3d2862eaa0 100644
--- a/src/gallium/drivers/identity/id_public.h
+++ b/src/gallium/drivers/identity/id_public.h
@@ -37,4 +37,4 @@ identity_screen_create(struct pipe_screen *screen);
struct pipe_context *
identity_context_create(struct pipe_screen *screen, struct pipe_context *pipe);
-#endif /* PT_PUBLIC_H */
+#endif /* ID_PUBLIC_H */
diff --git a/src/gallium/drivers/identity/id_screen.c b/src/gallium/drivers/identity/id_screen.c
index 26439637d0..53eae3ef54 100644
--- a/src/gallium/drivers/identity/id_screen.c
+++ b/src/gallium/drivers/identity/id_screen.c
@@ -379,6 +379,33 @@ identity_screen_buffer_destroy(struct pipe_buffer *_buffer)
identity_buffer_destroy(identity_buffer(_buffer));
}
+static struct pipe_video_surface *
+identity_screen_video_surface_create(struct pipe_screen *_screen,
+ enum pipe_video_chroma_format chroma_format,
+ unsigned width,
+ unsigned height)
+{
+ struct identity_screen *id_screen = identity_screen(_screen);
+ struct pipe_screen *screen = id_screen->screen;
+ struct pipe_video_surface *result;
+
+ result = screen->video_surface_create(screen,
+ chroma_format,
+ width,
+ height);
+
+ if (result) {
+ return identity_video_surface_create(id_screen, result);
+ }
+ return NULL;
+}
+
+static void
+identity_screen_video_surface_destroy(struct pipe_video_surface *_vsfc)
+{
+ identity_video_surface_destroy(identity_video_surface(_vsfc));
+}
+
static void
identity_screen_flush_frontbuffer(struct pipe_screen *_screen,
struct pipe_surface *_surface,
@@ -472,6 +499,12 @@ identity_screen_create(struct pipe_screen *screen)
if (screen->buffer_unmap)
id_screen->base.buffer_unmap = identity_screen_buffer_unmap;
id_screen->base.buffer_destroy = identity_screen_buffer_destroy;
+ if (screen->video_surface_create) {
+ id_screen->base.video_surface_create = identity_screen_video_surface_create;
+ }
+ if (screen->video_surface_destroy) {
+ id_screen->base.video_surface_destroy = identity_screen_video_surface_destroy;
+ }
id_screen->base.flush_frontbuffer = identity_screen_flush_frontbuffer;
id_screen->base.fence_reference = identity_screen_fence_reference;
id_screen->base.fence_signalled = identity_screen_fence_signalled;
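The identity-driver changes above follow the usual wrapper pattern: the video_surface hooks are installed only when the wrapped screen actually implements them, and the wrapper object keeps a pointer to the real object so calls can be forwarded and later unwrapped. A minimal sketch of that conditional forwarding, with hypothetical types rather than the real Gallium interfaces:

    #include <stddef.h>

    struct screen {
       /* Optional hook: may be NULL if the driver has no video support. */
       int (*video_surface_create)(struct screen *s, unsigned w, unsigned h);
    };

    struct id_screen {
       struct screen base;     /* what callers see */
       struct screen *real;    /* the wrapped driver screen */
    };

    static int id_video_surface_create(struct screen *s, unsigned w, unsigned h)
    {
       struct id_screen *id = (struct id_screen *)s;
       /* Forward to the real driver, as the identity layer does. */
       return id->real->video_surface_create(id->real, w, h);
    }

    static void id_screen_init(struct id_screen *id, struct screen *real)
    {
       id->real = real;
       /* Install the wrapper only if the underlying screen has the hook,
        * so callers can keep testing the function pointer for NULL. */
       id->base.video_surface_create =
          real->video_surface_create ? id_video_surface_create : NULL;
    }

    int main(void)
    {
       struct screen real = { NULL };   /* a driver without video support */
       struct id_screen id;

       id_screen_init(&id, &real);
       /* The wrapper hook stays NULL because the real screen lacks it. */
       return id.base.video_surface_create ? 1 : 0;
    }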
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index ae23329b83..769733b6dd 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -69,6 +69,7 @@ static void r300_destroy_context(struct pipe_context* context)
FREE(r300->blend_color_state);
FREE(r300->rs_block);
FREE(r300->scissor_state);
+ FREE(r300->vertex_info);
FREE(r300->viewport_state);
FREE(r300);
}
@@ -123,15 +124,24 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
r300->context.clear = r300_clear;
- if (r300screen->caps->has_tcl)
- {
+ if (r300screen->caps->has_tcl) {
r300->context.draw_arrays = r300_draw_arrays;
r300->context.draw_elements = r300_draw_elements;
r300->context.draw_range_elements = r300_draw_range_elements;
- }
- else
- {
- assert(0);
+ } else {
+ r300->context.draw_arrays = r300_swtcl_draw_arrays;
+ r300->context.draw_elements = r300_draw_elements;
+ r300->context.draw_range_elements = r300_swtcl_draw_range_elements;
+
+ /* Create a Draw. This is used for SW TCL. */
+ r300->draw = draw_create();
+ /* Enable our renderer. */
+ draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
+ /* Enable Draw's clipping. */
+ draw_set_driver_clipping(r300->draw, FALSE);
+ /* Force Draw to never do viewport transform, since we can do
+ * transform in hardware, always. */
+ draw_set_viewport_state(r300->draw, &r300_viewport_identity);
}
r300->context.is_texture_referenced = r300_is_texture_referenced;
@@ -143,18 +153,9 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
r300->rs_block = CALLOC_STRUCT(r300_rs_block);
r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
+ r300->vertex_info = CALLOC_STRUCT(r300_vertex_info);
r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
- /* Create a Draw. This is used for vert collation and SW TCL. */
- r300->draw = draw_create();
- /* Enable our renderer. */
- draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
- /* Disable Draw's clipping if TCL is present. */
- draw_set_driver_clipping(r300->draw, r300_screen(screen)->caps->has_tcl);
- /* Force Draw to never do viewport transform, since (again) we can do
- * transform in hardware, always. */
- draw_set_viewport_state(r300->draw, &r300_viewport_identity);
-
/* Open up the OQ BO. */
r300->oqbo = screen->buffer_create(screen, 4096,
PIPE_BUFFER_USAGE_VERTEX, 4096);
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f954ba7f9a..39c0914cff 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -92,6 +92,10 @@ struct r300_sampler_state {
uint32_t filter0; /* R300_TX_FILTER0: 0x4400 */
uint32_t filter1; /* R300_TX_FILTER1: 0x4440 */
uint32_t border_color; /* R300_TX_BORDER_COLOR: 0x45c0 */
+
+ /* Min/max LOD must be clamped to [0, last_level], thus
+ * it's dependent on a currently bound texture */
+ unsigned min_lod, max_lod;
};
struct r300_scissor_state {
@@ -219,11 +223,6 @@ struct r300_texture {
struct r300_vertex_info {
/* Parent class */
struct vertex_info vinfo;
- /* Map of vertex attributes into PVS memory for HW TCL,
- * or GA memory for SW TCL. */
- int vs_tab[16];
- /* Map of rasterizer attributes from GB through RS to US. */
- int fs_tab[16];
/* R300_VAP_PROG_STREAK_CNTL_[0-7] */
uint32_t vap_prog_stream_cntl[8];
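The new min_lod/max_lod fields added to r300_sampler_state above are clamped against the currently bound texture when texture state is emitted (see r300_emit_texture() in the r300_emit.c hunk below): the max level is bounded by the texture's last_level, and the min level by the resulting max. A short arithmetic sketch of that clamping with made-up values:

    #include <assert.h>

    #define MIN2(A, B) ((A) < (B) ? (A) : (B))

    int main(void)
    {
       unsigned sampler_min_lod = 2, sampler_max_lod = 12;
       unsigned tex_last_level = 5;        /* texture has mip levels 0..5 */

       /* Mirrors the clamping done while emitting texture state. */
       unsigned max_level = MIN2(sampler_max_lod, tex_last_level);  /* 5 */
       unsigned min_level = MIN2(sampler_min_lod, max_level);       /* 2 */

       assert(max_level == 5 && min_level == 2);
       return 0;
    }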
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index eeb97a2d37..ad7dff36be 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -129,7 +129,9 @@ static const float * get_shader_constant(
struct rc_constant * constant,
struct r300_constant_buffer * externals)
{
- static const float zero[4] = { 0.0, 0.0, 0.0, 0.0 };
+ static float vec[4] = { 0.0, 0.0, 0.0, 1.0 };
+ struct pipe_texture *tex;
+
switch(constant->Type) {
case RC_CONSTANT_EXTERNAL:
return externals->constants[constant->u.External];
@@ -137,11 +139,31 @@ static const float * get_shader_constant(
case RC_CONSTANT_IMMEDIATE:
return constant->u.Immediate;
+ case RC_CONSTANT_STATE:
+ switch (constant->u.State[0]) {
+ /* Factor for converting rectangle coords to
+ * normalized coords. Should only show up on non-r500. */
+ case RC_STATE_R300_TEXRECT_FACTOR:
+ tex = &r300->textures[constant->u.State[1]]->tex;
+ vec[0] = 1.0 / tex->width[0];
+ vec[1] = 1.0 / tex->height[0];
+ break;
+
+ default:
+ debug_printf("r300: Implementation error: "
+                "Unknown RC_CONSTANT_STATE value %d\n", constant->u.State[0]);
+ }
+ break;
+
default:
- debug_printf("r300: Implementation error: Unhandled constant type %i\n",
- constant->Type);
- return zero;
+ debug_printf("r300: Implementation error: "
+ "Unhandled constant type %d\n", constant->Type);
}
+
+    /* At this point vec is either (0, 0, 0, 1), a relatively safe RGBA or
+     * STRQ value, or it has been overwritten above with one of the
+     * RC_CONSTANT_STATE factors. */
+ return vec;
}
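
A minimal sketch (not part of the patch; the texture size and helper name are
hypothetical) of how the RC_STATE_R300_TEXRECT_FACTOR constant is meant to be
consumed: rectangle textures use unnormalized coordinates, which get scaled by
(1/width, 1/height) into the [0, 1] range samplers expect.

    #include <stdio.h>

    static void texrect_factor_example(void)
    {
        /* For a hypothetical 640x480 rectangle texture. */
        const float factor[2] = { 1.0f / 640.0f, 1.0f / 480.0f };
        float s = 320.0f * factor[0];   /* 320 texels -> 0.5 */
        float t = 240.0f * factor[1];   /* 240 texels -> 0.5 */

        printf("normalized coords: %f %f\n", s, t);
    }
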
/* Convert a normal single-precision float into the 7.16 format
@@ -561,6 +583,8 @@ void r300_emit_texture(struct r300_context* r300,
unsigned offset)
{
uint32_t filter0 = sampler->filter0;
+ uint32_t format0 = tex->state.format0;
+ unsigned min_level, max_level;
CS_LOCALS(r300);
/* to emulate 1D textures through 2D ones correctly */
@@ -569,13 +593,20 @@ void r300_emit_texture(struct r300_context* r300,
filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
}
+    /* Determine min/max levels: the MAX_MIP register takes the finest
+     * (lowest-numbered, highest-resolution) mipmap level. */
+ max_level = MIN2(sampler->max_lod, tex->tex.last_level);
+ min_level = MIN2(sampler->min_lod, max_level);
+ format0 |= R300_TX_NUM_LEVELS(max_level);
+ filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
+
BEGIN_CS(16);
OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), filter0 |
(offset << 28));
OUT_CS_REG(R300_TX_FILTER1_0 + (offset * 4), sampler->filter1);
OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (offset * 4), sampler->border_color);
- OUT_CS_REG(R300_TX_FORMAT0_0 + (offset * 4), tex->state.format0);
+ OUT_CS_REG(R300_TX_FORMAT0_0 + (offset * 4), format0);
OUT_CS_REG(R300_TX_FORMAT1_0 + (offset * 4), tex->state.format1);
OUT_CS_REG(R300_TX_FORMAT2_0 + (offset * 4), tex->state.format2);
OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (offset * 4), 1);
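
A worked example of the clamping above, with hypothetical values (not part of
the patch): a texture with last_level = 5 bound to a sampler with min_lod = 2
and max_lod = 1000 gives

    max_level = MIN2(1000, 5) = 5   ->  R300_TX_NUM_LEVELS(5)
    min_level = MIN2(2, 5)    = 2   ->  R300_TX_MAX_MIP_LEVEL(2)
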
@@ -690,12 +721,37 @@ void r300_emit_vertex_format_state(struct r300_context* r300)
END_CS;
}
+/* Return the number of bits set in the given number.
+ * XXX This should go to util ... */
+static unsigned bitcount(unsigned n)
+{
+ unsigned bits = 0;
+
+ while (n) {
+ if (n & 1) {
+ bits++;
+ }
+ n >>= 1;
+ }
+
+ return bits;
+}
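
The XXX above asks for this helper to move to util; a minimal sketch (not part
of the patch, helper name hypothetical) of an equivalent implementation using
Kernighan's trick, which clears the lowest set bit on every iteration:

    static unsigned util_bitcount_sketch(unsigned n)
    {
        unsigned bits;

        /* n &= n - 1 clears the lowest set bit, so the loop body runs once
         * per set bit rather than once per bit position. */
        for (bits = 0; n; n &= n - 1)
            bits++;

        return bits;
    }
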
+
void r300_emit_vertex_program_code(struct r300_context* r300,
struct r300_vertex_program_code* code)
{
int i;
struct r300_screen* r300screen = r300_screen(r300->context.screen);
unsigned instruction_count = code->length / 4;
+
+ int vtx_mem_size = r300screen->caps->is_r500 ? 128 : 72;
+ int input_count = MAX2(bitcount(code->InputsRead), 1);
+ int output_count = MAX2(bitcount(code->OutputsWritten), 1);
+ int temp_count = MAX2(code->num_temporaries, 1);
+ int pvs_num_slots = MIN3(vtx_mem_size / input_count,
+ vtx_mem_size / output_count, 10);
+ int pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);
+
CS_LOCALS(r300);
if (!r300screen->caps->has_tcl) {
@@ -708,8 +764,7 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
/* R300_VAP_PVS_CODE_CNTL_0
* R300_VAP_PVS_CONST_CNTL
* R300_VAP_PVS_CODE_CNTL_1
- * See the r5xx docs for instructions on how to use these.
- * XXX these could be optimized to select better values... */
+ * See the r5xx docs for instructions on how to use these. */
OUT_CS_REG_SEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
OUT_CS(R300_PVS_FIRST_INST(0) |
R300_PVS_XYZW_VALID_INST(instruction_count - 1) |
@@ -722,10 +777,11 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
for (i = 0; i < code->length; i++)
OUT_CS(code->body.d[i]);
- OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(10) |
- R300_PVS_NUM_CNTLRS(5) |
+ OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(pvs_num_slots) |
+ R300_PVS_NUM_CNTLRS(pvs_num_controllers) |
R300_PVS_NUM_FPUS(r300screen->caps->num_vert_fpus) |
- R300_PVS_VF_MAX_VTX_NUM(12));
+ R300_PVS_VF_MAX_VTX_NUM(12) |
+ (r300screen->caps->is_r500 ? R500_TCL_STATE_OPTIMIZATION : 0));
END_CS;
}
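
A worked example of the slot/controller computation above, with a hypothetical
shader on a non-r500 chip (vtx_mem_size = 72): 3 bits set in InputsRead, 5 in
OutputsWritten, and 8 temporaries give

    pvs_num_slots       = MIN3(72 / 3, 72 / 5, 10) = MIN3(24, 14, 10) = 10
    pvs_num_controllers = MIN2(72 / 8, 6)          = MIN2(9, 6)       = 6
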
@@ -790,13 +846,22 @@ void r300_emit_viewport_state(struct r300_context* r300,
END_CS;
}
+void r300_emit_texture_count(struct r300_context* r300)
+{
+ CS_LOCALS(r300);
+
+ BEGIN_CS(2);
+ OUT_CS_REG(R300_TX_ENABLE, (1 << r300->texture_count) - 1);
+ END_CS;
+
+}
+
void r300_flush_textures(struct r300_context* r300)
{
CS_LOCALS(r300);
- BEGIN_CS(4);
+ BEGIN_CS(2);
OUT_CS_REG(R300_TX_INVALTAGS, 0);
- OUT_CS_REG(R300_TX_ENABLE, (1 << r300->texture_count) - 1);
END_CS;
}
@@ -950,6 +1015,8 @@ validate:
/* Samplers and textures are tracked separately but emitted together. */
if (r300->dirty_state &
(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
+ r300_emit_texture_count(r300);
+
for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
if (r300->dirty_state &
((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i))) {
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 7c83c5166d..3797d3d332 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -92,6 +92,8 @@ void r300_emit_vertex_shader(struct r300_context* r300,
void r300_emit_viewport_state(struct r300_context* r300,
struct r300_viewport_state* viewport);
+void r300_emit_texture_count(struct r300_context* r300);
+
void r300_flush_textures(struct r300_context* r300);
/* Emit all dirty state. */
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 8ca785cb58..3a419b24b0 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1293,7 +1293,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R500_RS_INST_TEX_ID(x) ((x) << 0)
#define R500_RS_INST_TEX_CN_WRITE (1 << 4)
#define R500_RS_INST_TEX_ADDR_SHIFT 5
-# define R500_RS_INST_TEX_ADDR(x) ((x) << 0)
+# define R500_RS_INST_TEX_ADDR(x) ((x) << 5)
#define R500_RS_INST_COL_ID_SHIFT 12
# define R500_RS_INST_COL_ID(x) ((x) << 12)
#define R500_RS_INST_COL_CN_NO_WRITE (0 << 16)
@@ -1463,6 +1463,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_MIN_FILTER_MIP_NEAREST (1 << 13)
# define R300_TX_MIN_FILTER_MIP_LINEAR (2 << 13)
# define R300_TX_MIN_FILTER_MIP_MASK (3 << 13)
+# define R300_TX_MAX_MIP_LEVEL_SHIFT 17
+# define R300_TX_MAX_MIP_LEVEL_MASK (0xf << 17)
# define R300_TX_MAX_ANISO_1_TO_1 (0 << 21)
# define R300_TX_MAX_ANISO_2_TO_1 (1 << 21)
# define R300_TX_MAX_ANISO_4_TO_1 (2 << 21)
@@ -1471,6 +1473,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_MAX_ANISO_MASK (7 << 21)
# define R300_TX_WRAP_S(x) ((x) << 0)
# define R300_TX_WRAP_T(x) ((x) << 3)
+# define R300_TX_MAX_MIP_LEVEL(x) ((x) << 17)
#define R300_TX_FILTER1_0 0x4440
# define R300_CHROMA_KEY_MODE_DISABLE 0
@@ -1500,8 +1503,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_HEIGHTMASK_MASK (2047 << 11)
# define R300_TX_DEPTHMASK_SHIFT 22
# define R300_TX_DEPTHMASK_MASK (0xf << 22)
-# define R300_TX_MAX_MIP_LEVEL_SHIFT 26
-# define R300_TX_MAX_MIP_LEVEL_MASK (0xf << 26)
# define R300_TX_SIZE_PROJECTED (1 << 30)
# define R300_TX_PITCH_EN (1 << 31)
# define R300_TX_WIDTH(x) ((x) << 0)
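
For reference, the RS_INST_TEX_ADDR fix above changes what the macro expands
to; with a hypothetical fp_offset of 2:

    old: R500_RS_INST_TEX_ADDR(2) = (2 << 0) = 0x02
    new: R500_RS_INST_TEX_ADDR(2) = (2 << 5) = 0x40  /* matches R500_RS_INST_TEX_ADDR_SHIFT */
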
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 62e1456ed3..4c5fb405c6 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -183,8 +183,6 @@ boolean r300_draw_range_elements(struct pipe_context* pipe,
return FALSE;
}
- setup_vertex_attributes(r300);
-
setup_index_buffer(r300, indexBuffer, indexSize);
r300_emit_dirty_state(r300);
@@ -226,8 +224,6 @@ boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
return FALSE;
}
- setup_vertex_attributes(r300);
-
r300_emit_dirty_state(r300);
r300_emit_aos(r300, start);
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 41df31f670..1ce5ff3904 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -27,6 +27,8 @@
#include "r300_chipset.h"
+struct r300_winsys;
+
struct r300_screen {
/* Parent class */
struct pipe_screen screen;
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index d1eced61db..a88d66db24 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -523,6 +523,11 @@ static void*
state->mag_img_filter,
state->min_mip_filter);
+    /* Unfortunately, r300-r500 don't support floating-point mipmap LODs,
+     * so they are passed to the emit function, which clamps them properly. */
+ sampler->min_lod = MAX2((unsigned)state->min_lod, 0);
+ sampler->max_lod = MAX2((unsigned)ceilf(state->max_lod), 0);
+
lod_bias = CLAMP((int)(state->lod_bias * 32), -(1 << 9), (1 << 9) - 1);
sampler->filter1 |= lod_bias << R300_LOD_BIAS_SHIFT;
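
A worked example of the bias encoding above, with hypothetical values (not
part of the patch); the bias is stored as signed fixed point with 5 fractional
bits, clamped to [-512, 511]:

    state->lod_bias =  0.50f  ->  (int)( 0.50f * 32) =  16
    state->lod_bias = -1.25f  ->  (int)(-1.25f * 32) = -40
    state->lod_bias = 20.00f  ->  640, clamped to 511
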
@@ -571,6 +576,7 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
struct pipe_texture** texture)
{
struct r300_context* r300 = r300_context(pipe);
+ boolean is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
int i;
/* XXX magic num */
@@ -585,6 +591,13 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
pipe_texture_reference((struct pipe_texture**)&r300->textures[i],
texture[i]);
r300->dirty_state |= (R300_NEW_TEXTURE << i);
+
+            /* R300-specific: set the texrect factor in the fragment shader. */
+ if (!is_r500 && r300->textures[i]->is_npot) {
+ /* XXX It would be nice to re-emit just 1 constant,
+ * XXX not all of them */
+ r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+ }
}
}
@@ -674,6 +687,8 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
draw_flush(r300->draw);
draw_set_vertex_buffers(r300->draw, count, buffers);
}
+
+ r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
}
static void r300_set_vertex_elements(struct pipe_context* pipe,
@@ -706,9 +721,6 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
tgsi_scan_shader(shader->tokens, &vs->info);
- /* Appease Draw. */
- vs->draw = draw_create_vertex_shader(r300->draw, shader);
-
return (void*)vs;
} else {
return draw_create_vertex_shader(r300->draw, shader);
@@ -719,8 +731,6 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
{
struct r300_context* r300 = r300_context(pipe);
- draw_flush(r300->draw);
-
if (r300_screen(pipe->screen)->caps->has_tcl) {
struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
@@ -731,10 +741,10 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
r300_translate_vertex_shader(r300, vs);
}
- draw_bind_vertex_shader(r300->draw, vs->draw);
r300->vs = vs;
r300->dirty_state |= R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
} else {
+ draw_flush(r300->draw);
draw_bind_vertex_shader(r300->draw,
(struct draw_vertex_shader*)shader);
}
@@ -748,7 +758,6 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
rc_constants_destroy(&vs->code.constants);
- draw_delete_vertex_shader(r300->draw, vs->draw);
FREE((void*)vs->state.tokens);
FREE(shader);
} else {
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 7166694edf..6fb780cb29 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -47,8 +47,8 @@ struct r300_shader_derived_value {
unsigned r300_shader_key_hash(void* key) {
struct r300_shader_key* shader_key = (struct r300_shader_key*)key;
- unsigned vs = (unsigned)shader_key->vs;
- unsigned fs = (unsigned)shader_key->fs;
+ unsigned vs = (intptr_t)shader_key->vs;
+ unsigned fs = (intptr_t)shader_key->fs;
return (vs << 16) | (fs & 0xffff);
}
@@ -61,88 +61,46 @@ int r300_shader_key_compare(void* key1, void* key2) {
(shader_key1->fs == shader_key2->fs);
}
-/* Set up the vs_tab and routes. */
-static void r300_vs_tab_routes(struct r300_context* r300,
- struct r300_vertex_info* vformat)
+/* Set up the vs_output_tab and routes. */
+static void r300_vs_output_tab_routes(struct r300_context* r300,
+ int* vs_output_tab)
{
- struct r300_screen* r300screen = r300_screen(r300->context.screen);
- struct vertex_info* vinfo = &vformat->vinfo;
- int* tab = vformat->vs_tab;
+ struct vertex_info* vinfo = &r300->vertex_info->vinfo;
boolean pos = FALSE, psize = FALSE, fog = FALSE;
int i, texs = 0, cols = 0;
- struct tgsi_shader_info* info;
-
- if (r300screen->caps->has_tcl) {
- /* Use vertex shader to determine required routes. */
- info = &r300->vs->info;
- } else {
- /* Use fragment shader to determine required routes. */
- info = &r300->fs->info;
- }
+ struct tgsi_shader_info* info = &r300->fs->info;
- assert(info->num_inputs <= 16);
+ /* XXX One day we should figure out how to handle a different number of
+ * VS outputs and FS inputs, as well as a different number of vertex streams
+ * and VS inputs. It's definitely one of the sources of hardlocks. */
- if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
- {
- for (i = 0; i < info->num_inputs; i++) {
- switch (r300->vs->code.inputs[i]) {
- case TGSI_SEMANTIC_POSITION:
- pos = TRUE;
- tab[i] = 0;
- break;
- case TGSI_SEMANTIC_COLOR:
- tab[i] = 2 + cols;
- cols++;
- break;
- case TGSI_SEMANTIC_PSIZE:
- assert(psize == FALSE);
- psize = TRUE;
- tab[i] = 15;
- break;
- case TGSI_SEMANTIC_FOG:
- assert(fog == FALSE);
- fog = TRUE;
- /* Fall through */
- case TGSI_SEMANTIC_GENERIC:
- tab[i] = 6 + texs;
- texs++;
- break;
- default:
- debug_printf("r300: Unknown vertex input %d\n",
- info->input_semantic_name[i]);
- break;
- }
- }
- }
- else
- {
- /* Just copy vert attribs over as-is. */
- for (i = 0; i < info->num_inputs; i++) {
- tab[i] = i;
- }
-
- for (i = 0; i < info->num_outputs; i++) {
- switch (info->output_semantic_name[i]) {
- case TGSI_SEMANTIC_POSITION:
- pos = TRUE;
- break;
- case TGSI_SEMANTIC_COLOR:
- cols++;
- break;
- case TGSI_SEMANTIC_PSIZE:
- psize = TRUE;
- break;
- case TGSI_SEMANTIC_FOG:
- fog = TRUE;
- /* Fall through */
- case TGSI_SEMANTIC_GENERIC:
- texs++;
- break;
- default:
- debug_printf("r300: Unknown vertex output %d\n",
- info->output_semantic_name[i]);
- break;
- }
+ for (i = 0; i < info->num_inputs; i++) {
+ switch (info->input_semantic_name[i]) {
+ case TGSI_SEMANTIC_POSITION:
+ pos = TRUE;
+ vs_output_tab[i] = 0;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ vs_output_tab[i] = 2 + cols;
+ cols++;
+ break;
+ case TGSI_SEMANTIC_PSIZE:
+ assert(psize == FALSE);
+ psize = TRUE;
+ vs_output_tab[i] = 15;
+ break;
+ case TGSI_SEMANTIC_FOG:
+ assert(fog == FALSE);
+ fog = TRUE;
+ /* Fall through */
+ case TGSI_SEMANTIC_GENERIC:
+ vs_output_tab[i] = 6 + texs;
+ texs++;
+ break;
+ default:
+            debug_printf("r300: Unknown vertex output semantic %d\n",
+ info->input_semantic_name[i]);
+ break;
}
}
@@ -161,14 +119,13 @@ static void r300_vs_tab_routes(struct r300_context* r300,
/* We need to add vertex position attribute only for SW TCL case,
* for HW TCL case it could be generated by vertex shader */
- if (!pos && !r300screen->caps->has_tcl) {
- debug_printf("r300: Forcing vertex position attribute emit...\n");
+ if (!pos) {
/* Make room for the position attribute
- * at the beginning of the tab. */
+ * at the beginning of the vs_output_tab. */
for (i = 15; i > 0; i--) {
- tab[i] = tab[i-1];
+ vs_output_tab[i] = vs_output_tab[i-1];
}
- tab[0] = 0;
+ vs_output_tab[0] = 0;
}
/* Position. */
@@ -227,43 +184,78 @@ static void r300_vs_tab_routes(struct r300_context* r300,
}
/* Update the PSC tables. */
-static void r300_vertex_psc(struct r300_context* r300,
- struct r300_vertex_info* vformat)
+static void r300_vertex_psc(struct r300_context* r300)
{
- struct r300_screen* r300screen = r300_screen(r300->context.screen);
- struct vertex_info* vinfo = &vformat->vinfo;
- int* tab = vformat->vs_tab;
+ struct r300_vertex_info *vformat = r300->vertex_info;
uint16_t type, swizzle;
enum pipe_format format;
- unsigned i, attrib_count;
+ unsigned i;
/* Vertex shaders have no semantics on their inputs,
- * so PSC should just route stuff based on their info,
+ * so PSC should just route stuff based on the vertex elements,
* and not on attrib information. */
- if (r300screen->caps->has_tcl) {
- attrib_count = r300->vs->info.num_inputs;
- DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
- attrib_count);
- } else {
- attrib_count = vinfo->num_attribs;
- DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
- for (i = 0; i < attrib_count; i++) {
- DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
- " tab %d\n", vinfo->attrib[i].src_index,
- vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
- tab[i]);
+ DBG(r300, DBG_DRAW, "r300: vs expects %d attribs, routing %d elements"
+ " in psc\n",
+ r300->vs->info.num_inputs,
+ r300->vertex_element_count);
+
+ for (i = 0; i < r300->vertex_element_count; i++) {
+ format = r300->vertex_element[i].src_format;
+
+ type = r300_translate_vertex_data_type(format) |
+ (i << R300_DST_VEC_LOC_SHIFT);
+ swizzle = r300_translate_vertex_data_swizzle(format);
+
+ if (i % 2) {
+ vformat->vap_prog_stream_cntl[i >> 1] |= type << 16;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16;
+ } else {
+ vformat->vap_prog_stream_cntl[i >> 1] |= type;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle;
}
}
+
+ assert(i <= 15);
+
+ /* Set the last vector in the PSC. */
+ if (i) {
+ i -= 1;
+ }
+ vformat->vap_prog_stream_cntl[i >> 1] |=
+ (R300_LAST_VEC << (i & 1 ? 16 : 0));
+}
+
+/* Update the PSC tables for SW TCL, using Draw. */
+static void r300_swtcl_vertex_psc(struct r300_context* r300,
+ int* vs_output_tab)
+{
+ struct r300_vertex_info *vformat = r300->vertex_info;
+ struct vertex_info* vinfo = &vformat->vinfo;
+ uint16_t type, swizzle;
+ enum pipe_format format;
+ unsigned i, attrib_count;
+
+ /* For each Draw attribute, route it to the fragment shader according
+ * to the vs_output_tab. */
+ attrib_count = vinfo->num_attribs;
+ DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
+ for (i = 0; i < attrib_count; i++) {
+ DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
+ " vs_output_tab %d\n", vinfo->attrib[i].src_index,
+ vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
+ vs_output_tab[i]);
+ }
+
for (i = 0; i < attrib_count; i++) {
/* Make sure we have a proper destination for our attribute. */
- assert(tab[i] != -1);
+ assert(vs_output_tab[i] != -1);
format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
/* Obtain the type of data in this attribute. */
type = r300_translate_vertex_data_type(format) |
- tab[i] << R300_DST_VEC_LOC_SHIFT;
+ vs_output_tab[i] << R300_DST_VEC_LOC_SHIFT;
/* Obtain the swizzle for this attribute. Note that the default
* swizzle in the hardware is not XYZW! */
@@ -272,12 +264,10 @@ static void r300_vertex_psc(struct r300_context* r300,
/* Add the attribute to the PSC table. */
if (i & 1) {
vformat->vap_prog_stream_cntl[i >> 1] |= type << 16;
-
vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16;
} else {
- vformat->vap_prog_stream_cntl[i >> 1] |= type << 0;
-
- vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 0;
+ vformat->vap_prog_stream_cntl[i >> 1] |= type;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle;
}
}
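
A minimal sketch (not part of the patch; values and helper name hypothetical)
of the PROG_STREAM_CNTL packing used by both PSC paths: element i lands in
dword i >> 1, in the high 16 bits when i is odd.

    #include <stdint.h>

    static void psc_packing_example(void)
    {
        uint32_t cntl[8] = { 0 };
        const uint16_t type[3] = { 0x0001, 0x0002, 0x0003 };

        cntl[0] |= type[0];         /* element 0 -> low 16 bits of dword 0  */
        cntl[0] |= type[1] << 16;   /* element 1 -> high 16 bits of dword 0 */
        cntl[1] |= type[2];         /* element 2 -> low 16 bits of dword 1  */

        (void)cntl;
    }
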
@@ -289,74 +279,12 @@ static void r300_vertex_psc(struct r300_context* r300,
(R300_LAST_VEC << (i & 1 ? 16 : 0));
}
-/* Set up the mappings from GB to US, for RS block. */
-static void r300_update_fs_tab(struct r300_context* r300,
- struct r300_vertex_info* vformat)
-{
- struct tgsi_shader_info* info = &r300->fs->info;
- int i, cols = 0, texs = 0, cols_emitted = 0;
- int* tab = vformat->fs_tab;
-
- for (i = 0; i < 16; i++) {
- tab[i] = -1;
- }
-
- assert(info->num_inputs <= 16);
- for (i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- tab[i] = INTERP_LINEAR;
- cols++;
- break;
- case TGSI_SEMANTIC_POSITION:
- case TGSI_SEMANTIC_PSIZE:
- debug_printf("r300: Implementation error: Can't use "
- "pos attribs in fragshader yet!\n");
- /* Pass through for now */
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_GENERIC:
- tab[i] = INTERP_PERSPECTIVE;
- break;
- default:
- debug_printf("r300: Unknown vertex input %d\n",
- info->input_semantic_name[i]);
- break;
- }
- }
-
- /* Now that we know where everything is... */
- DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
- for (i = 0; i < info->num_inputs; i++) {
- switch (tab[i]) {
- case INTERP_LINEAR:
- DBG(r300, DBG_DRAW, "r300: attrib: "
- "stack offset %d, color, tab %d\n",
- i, cols_emitted);
- tab[i] = cols_emitted;
- cols_emitted++;
- break;
- case INTERP_PERSPECTIVE:
- DBG(r300, DBG_DRAW, "r300: attrib: "
- "stack offset %d, texcoord, tab %d\n",
- i, cols + texs);
- tab[i] = cols + texs;
- texs++;
- break;
- case -1:
- debug_printf("r300: Implementation error: Bad fp interp!\n");
- default:
- break;
- }
- }
-
-}
-
/* Set up the RS block. This is the part of the chipset that actually does
* the rasterization of vertices into fragments. This is also the part of the
* chipset that locks up if any part of it is even slightly wrong. */
-static void r300_update_rs_block(struct r300_context* r300,
- struct r300_rs_block* rs)
+static void r300_update_rs_block(struct r300_context* r300)
{
+ struct r300_rs_block* rs = r300->rs_block;
struct tgsi_shader_info* info = &r300->fs->info;
int col_count = 0, fp_offset = 0, i, tex_count = 0;
int rs_tex_comp = 0;
@@ -390,17 +318,18 @@ static void r300_update_rs_block(struct r300_context* r300,
col_count++;
}
+ for (i = 0; i < col_count; i++) {
+ rs->inst[i] |= R500_RS_INST_COL_ID(i) |
+ R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_offset);
+ fp_offset++;
+ }
+
for (i = 0; i < tex_count; i++) {
rs->inst[i] |= R500_RS_INST_TEX_ID(i) |
R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_offset);
fp_offset++;
}
- for (i = 0; i < col_count; i++) {
- rs->inst[i] |= R500_RS_INST_COL_ID(i) |
- R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_offset);
- fp_offset++;
- }
} else {
for (i = 0; i < info->num_inputs; i++) {
switch (info->input_semantic_name[i]) {
@@ -425,8 +354,10 @@ static void r300_update_rs_block(struct r300_context* r300,
}
}
+ /* Rasterize at least one color, or bad things happen. */
if (col_count == 0) {
rs->ip[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+ col_count++;
}
if (tex_count == 0) {
@@ -437,9 +368,10 @@ static void r300_update_rs_block(struct r300_context* r300,
R300_RS_SEL_Q(R300_RS_SEL_K1);
}
- /* Rasterize at least one color, or bad things happen. */
- if ((col_count == 0) && (tex_count == 0)) {
- col_count++;
+ for (i = 0; i < col_count; i++) {
+ rs->inst[i] |= R300_RS_INST_COL_ID(i) |
+ R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_offset);
+ fp_offset++;
}
for (i = 0; i < tex_count; i++) {
@@ -447,28 +379,22 @@ static void r300_update_rs_block(struct r300_context* r300,
R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_offset);
fp_offset++;
}
-
- for (i = 0; i < col_count; i++) {
- rs->inst[i] |= R300_RS_INST_COL_ID(i) |
- R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_offset);
- fp_offset++;
- }
}
rs->count = (rs_tex_comp) | (col_count << R300_IC_COUNT_SHIFT) |
R300_HIRES_EN;
- rs->inst_count = MAX2(MAX2(col_count - 1, tex_count - 1), 0);
+ rs->inst_count = MAX3(col_count - 1, tex_count - 1, 0);
}
/* Update the vertex format. */
static void r300_update_derived_shader_state(struct r300_context* r300)
{
struct r300_screen* r300screen = r300_screen(r300->context.screen);
- struct r300_vertex_info* vformat;
- struct r300_rs_block* rs_block;
+ int vs_output_tab[16];
int i;
+
/*
struct r300_shader_key* key;
struct r300_shader_derived_value* value;
@@ -495,27 +421,26 @@ static void r300_update_derived_shader_state(struct r300_context* r300)
(void*)key, (void*)value);
} */
- /* XXX This will be refactored ASAP. */
- vformat = CALLOC_STRUCT(r300_vertex_info);
- rs_block = CALLOC_STRUCT(r300_rs_block);
+ /* Reset structures */
+ memset(r300->rs_block, 0, sizeof(struct r300_rs_block));
+ memset(r300->vertex_info, 0, sizeof(struct r300_vertex_info));
for (i = 0; i < 16; i++) {
- vformat->vs_tab[i] = -1;
- vformat->fs_tab[i] = -1;
+ vs_output_tab[i] = -1;
}
- r300_vs_tab_routes(r300, vformat);
- r300_vertex_psc(r300, vformat);
- r300_update_fs_tab(r300, vformat);
+ /* Update states */
+ r300_vs_output_tab_routes(r300, vs_output_tab);
- r300_update_rs_block(r300, rs_block);
+ if (r300screen->caps->has_tcl) {
+ r300_vertex_psc(r300);
+ } else {
+ r300_swtcl_vertex_psc(r300, vs_output_tab);
+ }
- FREE(r300->vertex_info);
- FREE(r300->rs_block);
+ r300_update_rs_block(r300);
- r300->vertex_info = vformat;
- r300->rs_block = rs_block;
- r300->dirty_state |= (R300_NEW_VERTEX_FORMAT | R300_NEW_RS_BLOCK);
+ r300->dirty_state |= R300_NEW_RS_BLOCK;
}
static void r300_update_ztop(struct r300_context* r300)
@@ -553,9 +478,9 @@ static void r300_update_ztop(struct r300_context* r300)
void r300_update_derived_state(struct r300_context* r300)
{
- /* XXX */
- if (TRUE || r300->dirty_state &
- (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER)) {
+ if (r300->dirty_state &
+ (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER |
+ R300_NEW_VERTEX_FORMAT)) {
r300_update_derived_shader_state(r300);
}
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index c07e6ae676..46d1cb39b5 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -84,7 +84,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
END_CS;
/* XXX unsorted stuff from surface_fill */
- BEGIN_CS(60 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
+ BEGIN_CS(56 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
/* Flush PVS. */
OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
@@ -135,8 +135,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000);
- OUT_CS_REG(R300_VAP_VTX_STATE_CNTL, 0x1);
- OUT_CS_REG(R300_VAP_VSM_VTX_ASSM, 0x405);
OUT_CS_REG(R300_SE_VTE_CNTL, 0x0000043F);
/* XXX */
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index aea25cf71d..d13aa8f036 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -43,8 +43,7 @@ static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
state->format2 = (tex->pitch[0] - 1) & 0x1fff;
} else {
/* power of two textures (3D, mipmaps, and no pitch) */
- state->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf) |
- R300_TX_NUM_LEVELS(pt->last_level & 0xf);
+ state->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf);
}
state->format1 = r300_translate_texformat(pt->format);
diff --git a/src/gallium/drivers/r300/r300_vbo.c b/src/gallium/drivers/r300/r300_vbo.c
index a6a159667a..6ebaf715dc 100644
--- a/src/gallium/drivers/r300/r300_vbo.c
+++ b/src/gallium/drivers/r300/r300_vbo.c
@@ -34,52 +34,6 @@
#include "r300_reg.h"
#include "r300_winsys.h"
-static INLINE void setup_vertex_attribute(struct r300_vertex_info *vinfo,
- struct pipe_vertex_element *vert_elem,
- unsigned attr_num)
-{
- uint16_t hw_fmt1, hw_fmt2;
-
- hw_fmt1 = r300_translate_vertex_data_type(vert_elem->src_format) |
- (attr_num << R300_DST_VEC_LOC_SHIFT);
- hw_fmt2 = r300_translate_vertex_data_swizzle(vert_elem->src_format);
-
- if (attr_num % 2 == 0)
- {
- vinfo->vap_prog_stream_cntl[attr_num >> 1] = hw_fmt1;
- vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] = hw_fmt2;
- }
- else
- {
- vinfo->vap_prog_stream_cntl[attr_num >> 1] |= hw_fmt1 << 16;
- vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] |= hw_fmt2 << 16;
- }
-}
-
-static void finish_vertex_attribs_setup(struct r300_vertex_info *vinfo,
- unsigned attribs_num)
-{
- uint32_t last_vec_bit = (attribs_num % 2 == 0) ?
- (R300_LAST_VEC << 16) : R300_LAST_VEC;
-
- assert(attribs_num > 0 && attribs_num <= 16);
- vinfo->vap_prog_stream_cntl[(attribs_num - 1) >> 1] |= last_vec_bit;
-}
-
-void setup_vertex_attributes(struct r300_context *r300)
-{
- struct pipe_vertex_element *vert_elem;
- int i;
-
- for (i = 0; i < r300->vertex_element_count; i++) {
- vert_elem = &r300->vertex_element[i];
- setup_vertex_attribute(r300->vertex_info, vert_elem, i);
- }
-
- finish_vertex_attribs_setup(r300->vertex_info,
- r300->vertex_element_count);
-}
-
static INLINE int get_buffer_offset(struct r300_context *r300,
unsigned int buf_nr,
unsigned int elem_offset)
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index 2a4ce315e3..00b02bf510 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -35,9 +35,6 @@ struct r300_vertex_shader {
struct pipe_shader_state state;
struct tgsi_shader_info info;
- /* Fallback shader, because Draw has issues */
- struct draw_vertex_shader* draw;
-
/* Has this shader been translated yet? */
boolean translated;
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 5f60139968..bdbb7fa9b9 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -90,14 +90,15 @@ softpipe_destroy( struct pipe_context *pipe )
if (softpipe->draw)
draw_destroy( softpipe->draw );
- softpipe->quad.shade->destroy( softpipe->quad.shade );
- softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
- softpipe->quad.blend->destroy( softpipe->quad.blend );
+ softpipe->quad.shade->destroy( softpipe->quad.shade );
+ softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+ softpipe->quad.blend->destroy( softpipe->quad.blend );
for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
pipe_surface_reference(&softpipe->framebuffer.cbufs[i], NULL);
}
+
sp_destroy_tile_cache(softpipe->zsbuf_cache);
pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 4076114d39..a8999ed347 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -126,7 +126,13 @@ exec_run( const struct sp_fragment_shader *base,
setup_pos_vector(quad->posCoef,
(float)quad->input.x0, (float)quad->input.y0,
&machine->QuadPos);
-
+
+ if (quad->input.facing) {
+ machine->Face = -1.0f;
+ } else {
+ machine->Face = 1.0f;
+ }
+
quad->inout.mask &= tgsi_exec_machine_run( machine );
if (quad->inout.mask == 0)
return FALSE;
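
A minimal sketch (not part of the patch) of what the added lines encode: the
quad's facing flag is collapsed into the sign convention the TGSI machine
exposes through its face register, assuming a nonzero flag means back-facing
here.

    static float face_value(unsigned facing)
    {
        return facing ? -1.0f : 1.0f;
    }
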
diff --git a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
index 3bf64b6331..52b97af163 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
@@ -47,22 +47,22 @@ static void
print_fs_traits(int fs_traits)
{
const char *strings[] = {
- "FS_COMPOSITE", // = 1 << 0,
- "FS_MASK", // = 1 << 1,
- "FS_SOLID_FILL", // = 1 << 2,
- "FS_LINGRAD_FILL", // = 1 << 3,
- "FS_RADGRAD_FILL", // = 1 << 4,
- "FS_CA_FULL", // = 1 << 5, /* src.rgba * mask.rgba */
- "FS_CA_SRCALPHA", // = 1 << 6, /* src.aaaa * mask.rgba */
- "FS_YUV", // = 1 << 7,
- "FS_SRC_REPEAT_NONE", // = 1 << 8,
- "FS_MASK_REPEAT_NONE",// = 1 << 9,
- "FS_SRC_SWIZZLE_RGB", // = 1 << 10,
- "FS_MASK_SWIZZLE_RGB",// = 1 << 11,
- "FS_SRC_SET_ALPHA", // = 1 << 12,
- "FS_MASK_SET_ALPHA", // = 1 << 13,
- "FS_SRC_LUMINANCE", // = 1 << 14,
- "FS_MASK_LUMINANCE", // = 1 << 15,
+ "FS_COMPOSITE", /* = 1 << 0 */
+ "FS_MASK", /* = 1 << 1 */
+ "FS_SOLID_FILL", /* = 1 << 2 */
+ "FS_LINGRAD_FILL", /* = 1 << 3 */
+ "FS_RADGRAD_FILL", /* = 1 << 4 */
+ "FS_CA_FULL", /* = 1 << 5 - src.rgba * mask.rgba */
+ "FS_CA_SRCALPHA", /* = 1 << 6 - src.aaaa * mask.rgba */
+ "FS_YUV", /* = 1 << 7 */
+ "FS_SRC_REPEAT_NONE", /* = 1 << 8 */
+ "FS_MASK_REPEAT_NONE",/* = 1 << 9 */
+ "FS_SRC_SWIZZLE_RGB", /* = 1 << 10 */
+ "FS_MASK_SWIZZLE_RGB",/* = 1 << 11 */
+ "FS_SRC_SET_ALPHA", /* = 1 << 12 */
+ "FS_MASK_SET_ALPHA", /* = 1 << 13 */
+ "FS_SRC_LUMINANCE", /* = 1 << 14 */
+ "FS_MASK_LUMINANCE", /* = 1 << 15 */
};
int i, k;
debug_printf("%s: ", __func__);
diff --git a/src/gallium/state_trackers/xorg/xorg_xv.c b/src/gallium/state_trackers/xorg/xorg_xv.c
index e7f4900528..4a701e93ec 100644
--- a/src/gallium/state_trackers/xorg/xorg_xv.c
+++ b/src/gallium/state_trackers/xorg/xorg_xv.c
@@ -73,10 +73,11 @@ static XF86VideoEncodingRec DummyEncoding[1] = {
}
};
-#define NUM_IMAGES 2
+#define NUM_IMAGES 3
static XF86ImageRec Images[NUM_IMAGES] = {
XVIMAGE_UYVY,
XVIMAGE_YUY2,
+ XVIMAGE_YV12,
};
struct xorg_xv_port_priv {
@@ -545,6 +546,7 @@ put_image(ScrnInfoPtr pScrn,
switch (id) {
case FOURCC_UYVY:
case FOURCC_YUY2:
+ case FOURCC_YV12:
default:
srcPitch = width << 1;
break;
@@ -593,6 +595,7 @@ query_image_attributes(ScrnInfoPtr pScrn,
switch (id) {
case FOURCC_UYVY:
case FOURCC_YUY2:
+ case FOURCC_YV12:
default:
size = *w << 1;
if (pitches)