diff --git a/configs/default b/configs/default
index 2ca6fe4..a4069cb 100644
--- a/configs/default
+++ b/configs/default
@@ -10,7 +10,7 @@ CONFIG_NAME = default
 # Version info
 MESA_MAJOR=8
 MESA_MINOR=0
-MESA_TINY=0
+MESA_TINY=2
 MESA_VERSION = $(MESA_MAJOR).$(MESA_MINOR).$(MESA_TINY)
 
 # external projects.  This should be useless now that we use libdrm.
diff --git a/docs/relnotes-8.0.2.html b/docs/relnotes-8.0.2.html
index ad1818c..d73ba9f 100644
--- a/docs/relnotes-8.0.2.html
+++ b/docs/relnotes-8.0.2.html
@@ -28,7 +28,9 @@ for DRI hardware acceleration.
 
 <h2>MD5 checksums</h2>
 <pre>
-tdb
+70eb3dc74fbfcd72f6776268ee1db52e  MesaLib-8.0.2.tar.gz
+a368104e5700707048dc3e8691a9a7a1  MesaLib-8.0.2.tar.bz2
+d5e5cdb85d2afdbcd1c0623d3ed1c54d  MesaLib-8.0.2.zip
 </pre>
 
 <h2>New features</h2>
diff --git a/src/egl/main/eglimage.c b/src/egl/main/eglimage.c
index d5deae7..1174d0a 100644
--- a/src/egl/main/eglimage.c
+++ b/src/egl/main/eglimage.c
@@ -45,7 +45,7 @@ _eglParseImageAttribList(_EGLImageAttribs *attrs, _EGLDisplay *dpy,
 
    (void) dpy;
 
-   memset(attrs, 0, sizeof(attrs));
+   memset(attrs, 0, sizeof(*attrs));
    attrs->ImagePreserved = EGL_FALSE;
    attrs->GLTextureLevel = 0;
    attrs->GLTextureZOffset = 0;
diff --git a/src/gallium/auxiliary/util/u_double_list.h b/src/gallium/auxiliary/util/u_double_list.h
index 2384c36..9d1129b 100644
--- a/src/gallium/auxiliary/util/u_double_list.h
+++ b/src/gallium/auxiliary/util/u_double_list.h
@@ -105,6 +105,11 @@ static INLINE void list_delinit(struct list_head *item)
 #define LIST_IS_EMPTY(__list)                   \
     ((__list)->next == (__list))
 
+/**
+ * Cast from a pointer to a member of a struct back to the containing struct.
+ *
+ * 'sample' MUST be initialized, or else the result is undefined!
+ */
 #ifndef container_of
 #define container_of(ptr, sample, member)				\
     (void *)((char *)(ptr)						\
@@ -112,29 +117,29 @@ static INLINE void list_delinit(struct list_head *item)
 #endif
 
 #define LIST_FOR_EACH_ENTRY(pos, head, member)				\
-   for (pos = container_of((head)->next, pos, member);			\
+   for (pos = NULL, pos = container_of((head)->next, pos, member);	\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.next, pos, member))
 
 #define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
-   for (pos = container_of((head)->next, pos, member),			\
+   for (pos = NULL, pos = container_of((head)->next, pos, member),	\
 	storage = container_of(pos->member.next, pos, member);	\
 	&pos->member != (head);						\
 	pos = storage, storage = container_of(storage->member.next, storage, member))
 
 #define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member)	\
-   for (pos = container_of((head)->prev, pos, member),			\
+   for (pos = NULL, pos = container_of((head)->prev, pos, member),	\
 	storage = container_of(pos->member.prev, pos, member);		\
 	&pos->member != (head);						\
 	pos = storage, storage = container_of(storage->member.prev, storage, member))
 
 #define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member)		\
-   for (pos = container_of((start), pos, member);			\
+   for (pos = NULL, pos = container_of((start), pos, member);		\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.next, pos, member))
 
 #define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member)		\
-   for (pos = container_of((start), pos, member);			\
+   for (pos = NULL, pos = container_of((start), pos, member);		\
 	&pos->member != (head);						\
 	pos = container_of(pos->member.prev, pos, member))
 
diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h
index 43ec917..7b23123 100644
--- a/src/gallium/auxiliary/util/u_linkage.h
+++ b/src/gallium/auxiliary/util/u_linkage.h
@@ -49,15 +49,16 @@ unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, cons
  *
  * num_slots is the size of the layout array and hardware limit instead.
  *
- * efficient_slots == 0 or efficient_solts == num_slots are typical settings.
+ * efficient_slots == 0 or efficient_slots == num_slots are typical settings.
  */
 void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots);
 
 static INLINE void
-util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots)
+util_semantic_table_from_layout(unsigned char *table, size_t table_size, unsigned char *layout,
+                                unsigned char first_slot_value, unsigned char num_slots)
 {
-   int i;
-   memset(table, 0xff, sizeof(table));
+   unsigned char i;
+   memset(table, 0xff, table_size);
 
    for(i = 0; i < num_slots; ++i)
       table[layout[i]] = first_slot_value + i;
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index dbd7c77..0babcbb 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -977,7 +977,8 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
 	if(fpc->fp->num_slots > num_texcoords)
 		return FALSE;
 	util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, num_texcoords);
-	util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, num_texcoords);
+	util_semantic_table_from_layout(fpc->generic_to_slot, sizeof fpc->generic_to_slot,
+                                        fpc->fp->slot_to_generic, 0, num_texcoords);
 
 	memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input));
 
diff --git a/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
index c48f936..b3da311 100644
--- a/src/gallium/drivers/r300/compiler/radeon_program_alu.c
+++ b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
@@ -41,13 +41,16 @@
 
 static struct rc_instruction *emit1(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
-	struct rc_src_register SrcReg)
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg;
 	return fpi;
@@ -55,13 +58,17 @@ static struct rc_instruction *emit1(
 
 static struct rc_instruction *emit2(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg,
 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg0;
 	fpi->U.I.SrcReg[1] = SrcReg1;
@@ -70,14 +77,18 @@ static struct rc_instruction *emit2(
 
 static struct rc_instruction *emit3(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	rc_opcode Opcode, struct rc_sub_instruction * base,
+	struct rc_dst_register DstReg,
 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
 	struct rc_src_register SrcReg2)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+	if (base) {
+		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+	}
+
 	fpi->U.I.Opcode = Opcode;
-	fpi->U.I.SaturateMode = Saturate;
 	fpi->U.I.DstReg = DstReg;
 	fpi->U.I.SrcReg[0] = SrcReg0;
 	fpi->U.I.SrcReg[1] = SrcReg1;
@@ -221,7 +232,7 @@ static void transform_ABS(struct radeon_compiler* c,
 	struct rc_src_register src = inst->U.I.SrcReg[0];
 	src.Abs = 1;
 	src.Negate = RC_MASK_NONE;
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
 	rc_remove_instruction(inst);
 }
 
@@ -240,7 +251,7 @@ static void transform_CEIL(struct radeon_compiler* c,
 
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
-	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
 	rc_remove_instruction(inst);
 }
@@ -256,7 +267,7 @@ static void transform_CLAMP(struct radeon_compiler *c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
 		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
-	emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
@@ -272,7 +283,7 @@ static void transform_DP2(struct radeon_compiler* c,
 	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
 	src1.Swizzle &= ~(63 << (3 * 2));
 	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
-	emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }
 
@@ -283,7 +294,7 @@ static void transform_DPH(struct radeon_compiler* c,
 	src0.Negate &= ~RC_MASK_W;
 	src0.Swizzle &= ~(7 << (3 * 3));
 	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
-	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
 
@@ -294,7 +305,7 @@ static void transform_DPH(struct radeon_compiler* c,
 static void transform_DST(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
 	rc_remove_instruction(inst);
@@ -305,7 +316,7 @@ static void transform_FLR(struct radeon_compiler* c,
 {
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
-	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 	rc_remove_instruction(inst);
 }
@@ -379,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
 		swizzle_wwww(srctemp));
 
 	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
 		dstregtmpmask(temp, RC_MASK_Z),
 		negate(swizzle_xxxx(srctemp)),
 		swizzle_wwww(srctemp),
 		builtin_zero);
 
 	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
 		dstregtmpmask(temp, RC_MASK_XYW),
 		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
 
@@ -401,7 +412,7 @@ static void transform_LRP(struct radeon_compiler* c,
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 		dst,
 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
-	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
+	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
 		inst->U.I.DstReg,
 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
 
@@ -418,7 +429,7 @@ static void transform_POW(struct radeon_compiler* c,
 
 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
-	emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
+	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
 
 	rc_remove_instruction(inst);
 }
@@ -472,7 +483,7 @@ static void transform_SEQ(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
@@ -481,7 +492,7 @@ static void transform_SEQ(struct radeon_compiler* c,
 static void transform_SFL(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
+	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
 	rc_remove_instruction(inst);
 }
 
@@ -491,7 +502,7 @@ static void transform_SGE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
@@ -503,7 +514,7 @@ static void transform_SGT(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
@@ -515,7 +526,7 @@ static void transform_SLE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
@@ -527,7 +538,7 @@ static void transform_SLT(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
@@ -539,7 +550,7 @@ static void transform_SNE(struct radeon_compiler* c,
 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
@@ -604,7 +615,7 @@ static void transform_XPD(struct radeon_compiler* c,
 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
-	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
@@ -719,7 +730,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
 	src1.Negate &= ~RC_MASK_W;
 	src1.Swizzle &= ~(7 << (3 * 3));
 	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
-	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }
 
@@ -1043,22 +1054,22 @@ static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
 	unsigned srctmp)
 {
 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
-		emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
 			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
-		emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
+		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
 			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
 		struct rc_dst_register moddst = inst->U.I.DstReg;
 
 		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
 			moddst.WriteMask = RC_MASK_X;
-			emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
+			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 		}
 		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
 			moddst.WriteMask = RC_MASK_Y;
-			emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
+			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
 		}
 	}
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index d132638..920612b 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -63,8 +63,13 @@ static void r300_blitter_begin(struct r300_context* r300, enum r300_blitter_op o
     util_blitter_save_vertex_shader(r300->blitter, r300->vs_state.state);
     util_blitter_save_viewport(r300->blitter, &r300->viewport);
     util_blitter_save_vertex_elements(r300->blitter, r300->velems);
-    util_blitter_save_vertex_buffers(r300->blitter, r300->vbuf_mgr->nr_vertex_buffers,
-                                     r300->vbuf_mgr->vertex_buffer);
+    if (r300->vbuf_mgr) {
+        util_blitter_save_vertex_buffers(r300->blitter, r300->vbuf_mgr->nr_vertex_buffers,
+                                         r300->vbuf_mgr->vertex_buffer);
+    } else {
+        util_blitter_save_vertex_buffers(r300->blitter, r300->swtcl_nr_vertex_buffers,
+                                         r300->swtcl_vertex_buffer);
+    }
 
     if (op & R300_SAVE_FRAMEBUFFER) {
         util_blitter_save_framebuffer(r300->blitter, r300->fb_state.state);
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 7d289ca..1626768 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -419,17 +419,19 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300_init_query_functions(r300);
     r300_init_state_functions(r300);
     r300_init_resource_functions(r300);
-    
+
     r300->context.create_video_decoder = vl_create_decoder;
     r300->context.create_video_buffer = vl_video_buffer_create;
 
-    r300->vbuf_mgr = u_vbuf_create(&r300->context, 1024 * 1024, 16,
+    if (r300->screen->caps.has_tcl) {
+        r300->vbuf_mgr = u_vbuf_create(&r300->context, 1024 * 1024, 16,
                                        PIPE_BIND_VERTEX_BUFFER |
                                        PIPE_BIND_INDEX_BUFFER,
                                        U_VERTEX_FETCH_DWORD_ALIGNED);
-    if (!r300->vbuf_mgr)
-        goto fail;
-    r300->vbuf_mgr->caps.format_fixed32 = 0;
+        if (!r300->vbuf_mgr)
+            goto fail;
+        r300->vbuf_mgr->caps.format_fixed32 = 0;
+    }
 
     r300->blitter = util_blitter_create(&r300->context);
     if (r300->blitter == NULL)
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index e40b7af..8264b28 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -581,6 +581,9 @@ struct r300_context {
     void *dsa_decompress_zmask;
 
     struct u_vbuf *vbuf_mgr;
+    struct pipe_index_buffer swtcl_index_buffer;
+    struct pipe_vertex_buffer swtcl_vertex_buffer[PIPE_MAX_ATTRIBS];
+    unsigned swtcl_nr_vertex_buffers;
 
     struct util_slab_mempool pool_transfers;
 
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 3897e99..e4afe78 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -1030,20 +1030,18 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
             R300_PVS_VF_MAX_VTX_NUM(12) |
             (r300screen->caps.is_r500 ? R500_TCL_STATE_OPTIMIZATION : 0));
 
-    /* Emit flow control instructions. */
-    if (code->num_fc_ops) {
-
-        OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
-        if (r300screen->caps.is_r500) {
-            OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, code->num_fc_ops * 2);
-            OUT_CS_TABLE(code->fc_op_addrs.r500, code->num_fc_ops * 2);
-        } else {
-            OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, code->num_fc_ops);
-            OUT_CS_TABLE(code->fc_op_addrs.r300, code->num_fc_ops);
-        }
-        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, code->num_fc_ops);
-        OUT_CS_TABLE(code->fc_loop_index, code->num_fc_ops);
+    /* Emit flow control instructions.  Even if there are no fc instructions,
+     * we still need to write the registers to make sure they are cleared. */
+    OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+    if (r300screen->caps.is_r500) {
+        OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, R300_VS_MAX_FC_OPS * 2);
+        OUT_CS_TABLE(code->fc_op_addrs.r500, R300_VS_MAX_FC_OPS * 2);
+    } else {
+        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, R300_VS_MAX_FC_OPS);
+        OUT_CS_TABLE(code->fc_op_addrs.r300, R300_VS_MAX_FC_OPS);
     }
+    OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, R300_VS_MAX_FC_OPS);
+    OUT_CS_TABLE(code->fc_loop_index, R300_VS_MAX_FC_OPS);
 
     END_CS;
 }
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 83cad42..1542648 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -818,7 +818,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
     struct pipe_transfer *ib_transfer = NULL;
     int i;
     void *indices = NULL;
-    boolean indexed = info->indexed && r300->vbuf_mgr->index_buffer.buffer;
+    boolean indexed = info->indexed && r300->swtcl_index_buffer.buffer;
 
     if (r300->skip_rendering) {
         return;
@@ -831,10 +831,10 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
             (indexed ? PREP_INDEXED : 0),
             indexed ? 256 : 6);
 
-    for (i = 0; i < r300->vbuf_mgr->nr_vertex_buffers; i++) {
-        if (r300->vbuf_mgr->vertex_buffer[i].buffer) {
+    for (i = 0; i < r300->swtcl_nr_vertex_buffers; i++) {
+        if (r300->swtcl_vertex_buffer[i].buffer) {
             void *buf = pipe_buffer_map(pipe,
-                                  r300->vbuf_mgr->vertex_buffer[i].buffer,
+                                  r300->swtcl_vertex_buffer[i].buffer,
                                   PIPE_TRANSFER_READ |
                                   PIPE_TRANSFER_UNSYNCHRONIZED,
                                   &vb_transfer[i]);
@@ -843,7 +843,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
     }
 
     if (indexed) {
-        indices = pipe_buffer_map(pipe, r300->vbuf_mgr->index_buffer.buffer,
+        indices = pipe_buffer_map(pipe, r300->swtcl_index_buffer.buffer,
                                   PIPE_TRANSFER_READ |
                                   PIPE_TRANSFER_UNSYNCHRONIZED, &ib_transfer);
     }
@@ -856,8 +856,8 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe,
     draw_flush(r300->draw);
     r300->draw_vbo_locked = FALSE;
 
-    for (i = 0; i < r300->vbuf_mgr->nr_vertex_buffers; i++) {
-        if (r300->vbuf_mgr->vertex_buffer[i].buffer) {
+    for (i = 0; i < r300->swtcl_nr_vertex_buffers; i++) {
+        if (r300->swtcl_vertex_buffer[i].buffer) {
             pipe_buffer_unmap(pipe, vb_transfer[i]);
             draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
         }
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 763321b..f28b0be 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -212,6 +212,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         switch (param)
         {
         case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+        case PIPE_SHADER_CAP_SUBROUTINES:
             return 0;
         default:;
         }
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 2bc7036..8a656e6 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1048,6 +1048,10 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
 
     /* Override some states for Draw. */
     rs->rs_draw.sprite_coord_enable = 0; /* We can do this in HW. */
+    rs->rs_draw.offset_point = 0;
+    rs->rs_draw.offset_line = 0;
+    rs->rs_draw.offset_tri = 0;
+    rs->rs_draw.offset_clamp = 0;
 
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
     vap_control_status = R300_VC_NO_SWAP;
@@ -1595,7 +1599,6 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
                                     const struct pipe_vertex_buffer* buffers)
 {
     struct r300_context* r300 = r300_context(pipe);
-    unsigned i;
     struct pipe_vertex_buffer dummy_vb = {0};
 
     /* There must be at least one vertex buffer set, otherwise it locks up. */
@@ -1605,18 +1608,13 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
         count = 1;
     }
 
-    u_vbuf_set_vertex_buffers(r300->vbuf_mgr, count, buffers);
-
     if (r300->screen->caps.has_tcl) {
-        /* HW TCL. */
-        for (i = 0; i < count; i++) {
-            if (buffers[i].buffer &&
-		!r300_resource(buffers[i].buffer)->b.user_ptr) {
-            }
-        }
+        u_vbuf_set_vertex_buffers(r300->vbuf_mgr, count, buffers);
         r300->vertex_arrays_dirty = TRUE;
     } else {
-        /* SW TCL. */
+        util_copy_vertex_buffers(r300->swtcl_vertex_buffer,
+                                 &r300->swtcl_nr_vertex_buffers,
+                                 buffers, count);
         draw_set_vertex_buffers(r300->draw, count, buffers);
     }
 }
@@ -1626,9 +1624,15 @@ static void r300_set_index_buffer(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    u_vbuf_set_index_buffer(r300->vbuf_mgr, ib);
-
-    if (!r300->screen->caps.has_tcl) {
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_set_index_buffer(r300->vbuf_mgr, ib);
+    } else {
+        if (ib) {
+            pipe_resource_reference(&r300->swtcl_index_buffer.buffer, ib->buffer);
+            memcpy(&r300->swtcl_index_buffer, ib, sizeof(*ib));
+        } else {
+            pipe_resource_reference(&r300->swtcl_index_buffer.buffer, NULL);
+        }
         draw_set_index_buffer(r300->draw, ib);
     }
 }
@@ -1702,11 +1706,11 @@ static void* r300_create_vertex_elements_state(struct pipe_context* pipe,
         return NULL;
 
     velems->count = count;
-    velems->vmgr_elements =
-        u_vbuf_create_vertex_elements(r300->vbuf_mgr, count, attribs,
-                                          velems->velem);
 
     if (r300_screen(pipe->screen)->caps.has_tcl) {
+        velems->vmgr_elements =
+            u_vbuf_create_vertex_elements(r300->vbuf_mgr, count, attribs,
+                                          velems->velem);
         /* Setup PSC.
          * The unused components will be replaced by (..., 0, 1). */
         r300_vertex_psc(velems);
@@ -1716,6 +1720,8 @@ static void* r300_create_vertex_elements_state(struct pipe_context* pipe,
                 align(util_format_get_blocksize(velems->velem[i].src_format), 4);
             velems->vertex_size_dwords += velems->format_size[i] / 4;
         }
+    } else {
+        memcpy(velems->velem, attribs, count * sizeof(struct pipe_vertex_element));
     }
 
     return velems;
@@ -1733,9 +1739,9 @@ static void r300_bind_vertex_elements_state(struct pipe_context *pipe,
 
     r300->velems = velems;
 
-    u_vbuf_bind_vertex_elements(r300->vbuf_mgr, state, velems->vmgr_elements);
-
-    if (r300->draw) {
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_bind_vertex_elements(r300->vbuf_mgr, state, velems->vmgr_elements);
+    } else {
         draw_set_vertex_elements(r300->draw, velems->count, velems->velem);
         return;
     }
@@ -1750,7 +1756,9 @@ static void r300_delete_vertex_elements_state(struct pipe_context *pipe, void *s
     struct r300_context *r300 = r300_context(pipe);
     struct r300_vertex_element_state *velems = state;
 
-    u_vbuf_destroy_vertex_elements(r300->vbuf_mgr, velems->vmgr_elements);
+    if (r300->screen->caps.has_tcl) {
+        u_vbuf_destroy_vertex_elements(r300->vbuf_mgr, velems->vmgr_elements);
+    }
     FREE(state);
 }
 
@@ -1765,10 +1773,10 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
     vs->state.tokens = tgsi_dup_tokens(shader->tokens);
 
     if (r300->screen->caps.has_tcl) {
-        r300_init_vs_outputs(vs);
+        r300_init_vs_outputs(r300, vs);
         r300_translate_vertex_shader(r300, vs);
     } else {
-        r300_draw_init_vertex_shader(r300->draw, vs);
+        r300_draw_init_vertex_shader(r300, vs);
     }
 
     return vs;
@@ -1794,9 +1802,8 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
     if (r300->screen->caps.has_tcl) {
         unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 3 : 2;
         r300_mark_atom_dirty(r300, &r300->vs_state);
-        r300->vs_state.size =
-                vs->code.length + 9 +
-        (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 4 : 0);
+        r300->vs_state.size = vs->code.length + 9 +
+			(R300_VS_MAX_FC_OPS * fc_op_dwords + 4);
 
         r300_mark_atom_dirty(r300, &r300->vs_constants);
         r300->vs_constants.size =
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 1eef071..4faf2b5 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -36,6 +36,7 @@
 
 /* Convert info about VS output semantics into r300_shader_semantics. */
 static void r300_shader_read_vs_outputs(
+    struct r300_context *r300,
     struct tgsi_shader_info* info,
     struct r300_shader_semantics* vs_outputs)
 {
@@ -83,6 +84,14 @@ static void r300_shader_read_vs_outputs(
                 fprintf(stderr, "r300 VP: cannot handle edgeflag output.\n");
                 break;
 
+            case TGSI_SEMANTIC_CLIPVERTEX:
+                assert(index == 0);
+                /* Draw does clip vertex for us. */
+                if (r300->screen->caps.has_tcl) {
+                    fprintf(stderr, "r300 VP: cannot handle clip vertex output.\n");
+                }
+                break;
+
             default:
                 fprintf(stderr, "r300 VP: unknown vertex output semantic: %i.\n",
                         info->output_semantic_name[i]);
@@ -160,10 +169,11 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     c->code->outputs[outputs->wpos] = reg++;
 }
 
-void r300_init_vs_outputs(struct r300_vertex_shader *vs)
+void r300_init_vs_outputs(struct r300_context *r300,
+                          struct r300_vertex_shader *vs)
 {
     tgsi_scan_shader(vs->state.tokens, &vs->info);
-    r300_shader_read_vs_outputs(&vs->info, &vs->outputs);
+    r300_shader_read_vs_outputs(r300, &vs->info, &vs->outputs);
 }
 
 static void r300_dummy_vertex_shader(
@@ -187,7 +197,7 @@ static void r300_dummy_vertex_shader(
     ureg_destroy(ureg);
 
     shader->dummy = TRUE;
-    r300_init_vs_outputs(shader);
+    r300_init_vs_outputs(r300, shader);
     r300_translate_vertex_shader(r300, shader);
 }
 
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index a482ddc..b02d5d7 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -56,12 +56,13 @@ struct r300_vertex_shader {
     void *draw_vs;
 };
 
-void r300_init_vs_outputs(struct r300_vertex_shader *vs);
+void r300_init_vs_outputs(struct r300_context *r300,
+                          struct r300_vertex_shader *vs);
 
 void r300_translate_vertex_shader(struct r300_context *r300,
                                   struct r300_vertex_shader *vs);
 
-void r300_draw_init_vertex_shader(struct draw_context *draw,
+void r300_draw_init_vertex_shader(struct r300_context *r300,
                                   struct r300_vertex_shader *vs);
 
 #endif /* R300_VS_H */
diff --git a/src/gallium/drivers/r300/r300_vs_draw.c b/src/gallium/drivers/r300/r300_vs_draw.c
index 2939963..69d6758 100644
--- a/src/gallium/drivers/r300/r300_vs_draw.c
+++ b/src/gallium/drivers/r300/r300_vs_draw.c
@@ -29,7 +29,7 @@
  *
  * Transformations:
  * 1) If the secondary color output is present, the primary color must be
- *    inserted before it.
+ *    present too.
  * 2) If any back-face color output is present, there must be all 4 color
  *    outputs and missing ones must be inserted.
  * 3) Insert a trailing texcoord output containing a copy of POS, for WPOS.
@@ -52,7 +52,6 @@ struct vs_transform_context {
 
     boolean color_used[2];
     boolean bcolor_used[2];
-    boolean temp_used[128];
 
     /* Index of the pos output, typically 0. */
     unsigned pos_output;
@@ -72,6 +71,8 @@ struct vs_transform_context {
     boolean first_instruction;
     /* End instruction processed? */
     boolean end_instruction;
+
+    boolean temp_used[1024];
 };
 
 static void emit_temp(struct tgsi_transform_context *ctx, unsigned reg)
@@ -102,9 +103,9 @@ static void emit_output(struct tgsi_transform_context *ctx,
     ++vsctx->num_outputs;
 }
 
-static void insert_output(struct tgsi_transform_context *ctx,
-                          struct tgsi_full_declaration *before,
-                          unsigned name, unsigned index, unsigned interp)
+static void insert_output_before(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_declaration *before,
+                                 unsigned name, unsigned index, unsigned interp)
 {
     struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
     unsigned i;
@@ -115,28 +116,29 @@ static void insert_output(struct tgsi_transform_context *ctx,
     }
 
     /* Insert the new output. */
-    emit_output(ctx, name, index, interp, before->Range.First);
+    emit_output(ctx, name, index, interp,
+                before->Range.First + vsctx->decl_shift);
 
     ++vsctx->decl_shift;
 }
 
-static void insert_trailing_bcolor(struct tgsi_transform_context *ctx,
-                                   struct tgsi_full_declaration *before)
+static void insert_output_after(struct tgsi_transform_context *ctx,
+                                struct tgsi_full_declaration *after,
+                                unsigned name, unsigned index, unsigned interp)
 {
     struct vs_transform_context *vsctx = (struct vs_transform_context *)ctx;
+    unsigned i;
 
-    /* If BCOLOR0 is used, make sure BCOLOR1 is present too. Otherwise
-     * the rasterizer doesn't do the color selection correctly. */
-    if (vsctx->bcolor_used[0] && !vsctx->bcolor_used[1]) {
-        if (before) {
-            insert_output(ctx, before, TGSI_SEMANTIC_BCOLOR, 1,
-                          TGSI_INTERPOLATE_LINEAR);
-        } else {
-            emit_output(ctx, TGSI_SEMANTIC_BCOLOR, 1,
-                        TGSI_INTERPOLATE_LINEAR, vsctx->num_outputs);
-        }
-        vsctx->bcolor_used[1] = TRUE;
+    /* Make a place for the new output. */
+    for (i = after->Range.First+1; i < Elements(vsctx->out_remap); i++) {
+        ++vsctx->out_remap[i];
     }
+
+    /* Insert the new output. */
+    emit_output(ctx, name, index, interp,
+                after->Range.First + 1);
+
+    ++vsctx->decl_shift;
 }
 
 static void transform_decl(struct tgsi_transform_context *ctx,
@@ -153,41 +155,38 @@ static void transform_decl(struct tgsi_transform_context *ctx,
 
             case TGSI_SEMANTIC_COLOR:
                 assert(decl->Semantic.Index < 2);
-                vsctx->color_used[decl->Semantic.Index] = TRUE;
 
                 /* We must rasterize the first color if the second one is
                  * used, otherwise the rasterizer doesn't do the color
                  * selection correctly. Declare it, but don't write to it. */
                 if (decl->Semantic.Index == 1 && !vsctx->color_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                     vsctx->color_used[0] = TRUE;
                 }
                 break;
 
             case TGSI_SEMANTIC_BCOLOR:
                 assert(decl->Semantic.Index < 2);
-                vsctx->bcolor_used[decl->Semantic.Index] = TRUE;
 
                 /* We must rasterize all 4 colors if back-face colors are
                  * used, otherwise the rasterizer doesn't do the color
                  * selection correctly. Declare it, but don't write to it. */
                 if (!vsctx->color_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                     vsctx->color_used[0] = TRUE;
                 }
                 if (!vsctx->color_used[1]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_COLOR, 1,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_COLOR, 1,
+                                         TGSI_INTERPOLATE_LINEAR);
                     vsctx->color_used[1] = TRUE;
                 }
                 if (decl->Semantic.Index == 1 && !vsctx->bcolor_used[0]) {
-                    insert_output(ctx, decl, TGSI_SEMANTIC_BCOLOR, 0,
-                                  TGSI_INTERPOLATE_LINEAR);
+                    insert_output_before(ctx, decl, TGSI_SEMANTIC_BCOLOR, 0,
+                                         TGSI_INTERPOLATE_LINEAR);
                     vsctx->bcolor_used[0] = TRUE;
                 }
-                /* One more case is handled in insert_trailing_bcolor. */
                 break;
 
             case TGSI_SEMANTIC_GENERIC:
@@ -195,11 +194,6 @@ static void transform_decl(struct tgsi_transform_context *ctx,
                 break;
         }
 
-        if (decl->Semantic.Name != TGSI_SEMANTIC_BCOLOR) {
-            /* Insert it as soon as possible. */
-            insert_trailing_bcolor(ctx, decl);
-        }
-
         /* Since we're inserting new outputs in between, the following outputs
          * should be moved to the right so that they don't overlap with
          * the newly added ones. */
@@ -214,6 +208,14 @@ static void transform_decl(struct tgsi_transform_context *ctx,
     }
 
     ctx->emit_declaration(ctx, decl);
+
+    /* Insert BCOLOR1 if needed. */
+    if (decl->Declaration.File == TGSI_FILE_OUTPUT &&
+        decl->Semantic.Name == TGSI_SEMANTIC_BCOLOR &&
+        !vsctx->bcolor_used[1]) {
+        insert_output_after(ctx, decl, TGSI_SEMANTIC_BCOLOR, 1,
+                            TGSI_INTERPOLATE_LINEAR);
+    }
 }
 
 static void transform_inst(struct tgsi_transform_context *ctx,
@@ -226,10 +228,6 @@ static void transform_inst(struct tgsi_transform_context *ctx,
     if (!vsctx->first_instruction) {
         vsctx->first_instruction = TRUE;
 
-        /* The trailing BCOLOR should be inserted before the code
-         * if it hasn't already been done so. */
-        insert_trailing_bcolor(ctx, NULL);
-
         /* Insert the generic output for WPOS. */
         emit_output(ctx, TGSI_SEMANTIC_GENERIC, vsctx->last_generic + 1,
                     TGSI_INTERPOLATE_PERSPECTIVE, vsctx->num_outputs);
@@ -309,14 +307,18 @@ static void transform_inst(struct tgsi_transform_context *ctx,
     ctx->emit_instruction(ctx, inst);
 }
 
-void r300_draw_init_vertex_shader(struct draw_context *draw,
+void r300_draw_init_vertex_shader(struct r300_context *r300,
                                   struct r300_vertex_shader *vs)
 {
+    struct draw_context *draw = r300->draw;
     struct pipe_shader_state new_vs;
+    struct tgsi_shader_info info;
     struct vs_transform_context transform;
     const uint newLen = tgsi_num_tokens(vs->state.tokens) + 100 /* XXX */;
     unsigned i;
 
+    tgsi_scan_shader(vs->state.tokens, &info);
+
     new_vs.tokens = tgsi_alloc_tokens(newLen);
     if (new_vs.tokens == NULL)
         return;
@@ -329,6 +331,22 @@ void r300_draw_init_vertex_shader(struct draw_context *draw,
     transform.base.transform_instruction = transform_inst;
     transform.base.transform_declaration = transform_decl;
 
+    for (i = 0; i < info.num_outputs; i++) {
+        unsigned index = info.output_semantic_index[i];
+
+        switch (info.output_semantic_name[i]) {
+            case TGSI_SEMANTIC_COLOR:
+                assert(index < 2);
+                transform.color_used[index] = TRUE;
+                break;
+
+            case TGSI_SEMANTIC_BCOLOR:
+                assert(index < 2);
+                transform.bcolor_used[index] = TRUE;
+                break;
+        }
+    }
+
     tgsi_transform_shader(vs->state.tokens,
                           (struct tgsi_token*)new_vs.tokens,
                           newLen, &transform.base);
@@ -350,7 +368,7 @@ void r300_draw_init_vertex_shader(struct draw_context *draw,
     vs->state.tokens = new_vs.tokens;
 
     /* Init the VS output table for the rasterizer. */
-    r300_init_vs_outputs(vs);
+    r300_init_vs_outputs(r300, vs);
 
     /* Make the last generic be WPOS. */
     vs->outputs.wpos = vs->outputs.generic[transform.last_generic + 1];
diff --git a/src/gallium/state_trackers/vega/text.c b/src/gallium/state_trackers/vega/text.c
index a183933..27d461c 100644
--- a/src/gallium/state_trackers/vega/text.c
+++ b/src/gallium/state_trackers/vega/text.c
@@ -73,8 +73,8 @@ static void add_glyph(struct vg_font *font,
    glyph = CALLOC_STRUCT(vg_glyph);
    glyph->object = obj;
    glyph->is_hinted = isHinted;
-   memcpy(glyph->glyph_origin, glyphOrigin, sizeof(glyphOrigin));
-   memcpy(glyph->escapement, escapement, sizeof(escapement));
+   memcpy(glyph->glyph_origin, glyphOrigin, sizeof(glyph->glyph_origin));
+   memcpy(glyph->escapement, escapement, sizeof(glyph->glyph_origin));
 
    cso_hash_insert(font->glyphs, (unsigned) glyphIndex, glyph);
 }
diff --git a/src/gallium/targets/egl-static/egl_st.c b/src/gallium/targets/egl-static/egl_st.c
index 81d7bb4..67e3c29 100644
--- a/src/gallium/targets/egl-static/egl_st.c
+++ b/src/gallium/targets/egl-static/egl_st.c
@@ -54,8 +54,9 @@ dlopen_gl_lib_cb(const char *dir, size_t len, void *callback_data)
    int ret;
 
    if (len) {
+      assert(len <= INT_MAX && "path is insanely long!");
       ret = util_snprintf(path, sizeof(path), "%.*s/%s" UTIL_DL_EXT,
-            len, dir, name);
+            (int)len, dir, name);
    }
    else {
       ret = util_snprintf(path, sizeof(path), "%s" UTIL_DL_EXT, name);
diff --git a/src/glsl/Android.mk b/src/glsl/Android.mk
index d7d17dd..84a8655 100644
--- a/src/glsl/Android.mk
+++ b/src/glsl/Android.mk
@@ -39,6 +39,7 @@ LOCAL_SRC_FILES := \
 	$(LIBGLSL_CXX_SOURCES)
 
 LOCAL_C_INCLUDES := \
+	external/astl/include \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/mesa
 
diff --git a/src/glx/apple/Makefile b/src/glx/apple/Makefile
index dc64295..68fe6ad 100644
--- a/src/glx/apple/Makefile
+++ b/src/glx/apple/Makefile
@@ -26,6 +26,7 @@ SOURCES = \
 	apple_glx.c \
 	apple_glx_context.c \
 	apple_glx_drawable.c \
+	apple_glx_log.c \
 	apple_glx_pbuffer.c \
 	apple_glx_pixmap.c \
 	apple_glx_surface.c \
diff --git a/src/glx/apple/apple_glx.c b/src/glx/apple/apple_glx.c
index d94c1e0..56cff64 100644
--- a/src/glx/apple/apple_glx.c
+++ b/src/glx/apple/apple_glx.c
@@ -33,6 +33,8 @@
 #include <assert.h>
 #include <stdarg.h>
 #include <dlfcn.h>
+#include <pthread.h>
+#include <inttypes.h>
 #include "appledri.h"
 #include "apple_glx.h"
 #include "apple_glx_context.h"
@@ -43,22 +45,6 @@ static int dri_event_base = 0;
 
 const GLuint __glXDefaultPixelStore[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
 
-static bool diagnostic = false;
-
-void
-apple_glx_diagnostic(const char *fmt, ...)
-{
-   va_list vl;
-
-   if (diagnostic) {
-      fprintf(stderr, "DIAG: ");
-
-      va_start(vl, fmt);
-      vfprintf(stderr, fmt, vl);
-      va_end(vl);
-   }
-}
-
 int
 apple_get_dri_event_base(void)
 {
@@ -125,10 +111,9 @@ apple_init_glx(Display * dpy)
    if (initialized)
       return false;
 
-   if (getenv("LIBGL_DIAGNOSTIC")) {
-      printf("initializing libGL in %s\n", __func__);
-      diagnostic = true;
-   }
+   apple_glx_log_init();
+
+   apple_glx_log(ASL_LEVEL_INFO, "Initializing libGL.");
 
    apple_cgl_init();
    (void) apple_glx_get_client_id();
diff --git a/src/glx/apple/apple_glx.h b/src/glx/apple/apple_glx.h
index ce8c488..0967f18 100644
--- a/src/glx/apple/apple_glx.h
+++ b/src/glx/apple/apple_glx.h
@@ -38,7 +38,8 @@
 #define XP_NO_X_HEADERS
 #include <Xplugin.h>
 
-void apple_glx_diagnostic(const char *fmt, ...);
+#include "apple_glx_log.h"
+
 xp_client_id apple_glx_get_client_id(void);
 bool apple_init_glx(Display * dpy);
 void apple_glx_swap_buffers(void *ptr);
diff --git a/src/glx/apple/apple_glx_context.c b/src/glx/apple/apple_glx_context.c
index c58d05a..0bb25b4 100644
--- a/src/glx/apple/apple_glx_context.c
+++ b/src/glx/apple/apple_glx_context.c
@@ -421,7 +421,7 @@ apple_glx_make_current_context(Display * dpy, void *oldptr, void *ptr,
     */
 
    if (same_drawable && ac->is_current) {
-      apple_glx_diagnostic("%s: same_drawable and ac->is_current\n");
+      apple_glx_diagnostic("same_drawable and ac->is_current\n");
       return false;
    }
 
diff --git a/src/glx/apple/apple_glx_drawable.c b/src/glx/apple/apple_glx_drawable.c
index 5530224..3f84d56 100644
--- a/src/glx/apple/apple_glx_drawable.c
+++ b/src/glx/apple/apple_glx_drawable.c
@@ -32,6 +32,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <pthread.h>
+#include <string.h>
 #include "apple_glx.h"
 #include "apple_glx_context.h"
 #include "apple_glx_drawable.h"
@@ -48,8 +49,8 @@ lock_drawables_list(void)
    err = pthread_mutex_lock(&drawables_lock);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_lock failure in %s: %d\n",
-              __func__, err);
+      fprintf(stderr, "pthread_mutex_lock failure in %s: %s\n",
+              __func__, strerror(err));
       abort();
    }
 }
@@ -62,8 +63,8 @@ unlock_drawables_list(void)
    err = pthread_mutex_unlock(&drawables_lock);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_unlock failure in %s: %d\n",
-              __func__, err);
+      fprintf(stderr, "pthread_mutex_unlock failure in %s: %s\n",
+              __func__, strerror(err));
       abort();
    }
 }
@@ -95,7 +96,7 @@ drawable_lock(struct apple_glx_drawable *agd)
    err = pthread_mutex_lock(&agd->mutex);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_lock error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_lock error: %s\n", strerror(err));
       abort();
    }
 }
@@ -108,7 +109,7 @@ drawable_unlock(struct apple_glx_drawable *d)
    err = pthread_mutex_unlock(&d->mutex);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_unlock error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_unlock error: %s\n", strerror(err));
       abort();
    }
 }
@@ -135,6 +136,7 @@ release_drawable(struct apple_glx_drawable *d)
 static bool
 destroy_drawable(struct apple_glx_drawable *d)
 {
+   int err;
 
    d->lock(d);
 
@@ -172,6 +174,12 @@ destroy_drawable(struct apple_glx_drawable *d)
 
    apple_glx_diagnostic("%s: freeing %p\n", __func__, (void *) d);
 
+   err = pthread_mutex_destroy(&d->mutex);
+   if (err) {
+      fprintf(stderr, "pthread_mutex_destroy error: %s\n", strerror(err));
+      abort();
+   }
+   
    free(d);
 
    /* So that the locks are balanced and the caller correctly unlocks. */
@@ -238,7 +246,7 @@ common_init(Display * dpy, GLXDrawable drawable, struct apple_glx_drawable *d)
    err = pthread_mutexattr_init(&attr);
 
    if (err) {
-      fprintf(stderr, "pthread_mutexattr_init error: %d\n", err);
+      fprintf(stderr, "pthread_mutexattr_init error: %s\n", strerror(err));
       abort();
    }
 
@@ -250,14 +258,14 @@ common_init(Display * dpy, GLXDrawable drawable, struct apple_glx_drawable *d)
    err = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 
    if (err) {
-      fprintf(stderr, "error: setting pthread mutex type: %d\n", err);
+      fprintf(stderr, "error: setting pthread mutex type: %s\n", strerror(err));
       abort();
    }
 
    err = pthread_mutex_init(&d->mutex, &attr);
 
    if (err) {
-      fprintf(stderr, "pthread_mutex_init error: %d\n", err);
+      fprintf(stderr, "pthread_mutex_init error: %s\n", strerror(err));
       abort();
    }
 
diff --git a/src/glx/apple/apple_glx_log.c b/src/glx/apple/apple_glx_log.c
new file mode 100644
index 0000000..9ebf666
--- /dev/null
+++ b/src/glx/apple/apple_glx_log.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2012 Apple Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT
+ * HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name(s) of the above
+ * copyright holders shall not be used in advertising or otherwise to
+ * promote the sale, use or other dealings in this Software without
+ * prior written authorization.
+ */
+
+#include <sys/cdefs.h>
+#include <asl.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include "apple_glx_log.h"
+
+static bool diagnostic = false;
+static aslclient aslc;
+
+void apple_glx_log_init(void) {
+    if (getenv("LIBGL_DIAGNOSTIC")) {
+        diagnostic = true;
+    }
+
+    aslc = asl_open(NULL, NULL, 0);
+}
+
+void _apple_glx_log(int level, const char *file, const char *function,
+                    int line, const char *fmt, ...) {
+    va_list v;
+    va_start(v, fmt);
+    _apple_glx_vlog(level, file, function, line, fmt, v);
+    va_end(v);
+}
+
+static const char *
+_asl_level_string(int level)
+{
+        if (level == ASL_LEVEL_EMERG) return ASL_STRING_EMERG;
+        if (level == ASL_LEVEL_ALERT) return ASL_STRING_ALERT;
+        if (level == ASL_LEVEL_CRIT) return ASL_STRING_CRIT;
+        if (level == ASL_LEVEL_ERR) return ASL_STRING_ERR;
+        if (level == ASL_LEVEL_WARNING) return ASL_STRING_WARNING;
+        if (level == ASL_LEVEL_NOTICE) return ASL_STRING_NOTICE;
+        if (level == ASL_LEVEL_INFO) return ASL_STRING_INFO;
+        if (level == ASL_LEVEL_DEBUG) return ASL_STRING_DEBUG;
+        return "unknown";
+}
+
+void _apple_glx_vlog(int level, const char *file, const char *function,
+                     int line, const char *fmt, va_list args) {
+    aslmsg msg;
+    uint64_t thread = 0;
+
+    if (pthread_is_threaded_np()) {
+        pthread_threadid_np(NULL, &thread);
+    }
+
+    if (diagnostic) {
+        va_list args2;
+        va_copy(args2, args);
+
+        fprintf(stderr, "%-9s %24s:%-4d %s(%"PRIu64"): ",
+                _asl_level_string(level), file, line, function, thread);
+        vfprintf(stderr, fmt, args2);
+    }
+
+    msg = asl_new(ASL_TYPE_MSG);
+    if (msg) {
+        if (file)
+            asl_set(msg, "File", file);
+        if (function)
+            asl_set(msg, "Function", function);
+        if (line) {
+            char *_line;
+            asprintf(&_line, "%d", line);
+            if (_line) {
+                asl_set(msg, "Line", _line);
+                free(_line);
+            }
+        }
+        if (pthread_is_threaded_np()) {
+            char *_thread;
+            asprintf(&_thread, "%"PRIu64, thread);
+            if (_thread) {
+                asl_set(msg, "Thread", _thread);
+                free(_thread);
+            }
+        }
+    }
+
+    asl_vlog(aslc, msg, level, fmt, args);
+    if (msg)
+        asl_free(msg);
+}
diff --git a/src/glx/apple/apple_glx_log.h b/src/glx/apple/apple_glx_log.h
new file mode 100644
index 0000000..4b1c531
--- /dev/null
+++ b/src/glx/apple/apple_glx_log.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012 Apple Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT
+ * HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name(s) of the above
+ * copyright holders shall not be used in advertising or otherwise to
+ * promote the sale, use or other dealings in this Software without
+ * prior written authorization.
+ */
+
+#ifndef APPLE_GLX_LOG_H
+#define APPLE_GLX_LOG_H
+
+#include <sys/cdefs.h>
+#include <asl.h>
+
+void apple_glx_log_init(void);
+
+__printflike(5, 6)
+void _apple_glx_log(int level, const char *file, const char *function,
+                    int line, const char *fmt, ...);
+#define apple_glx_log(l, f, args ...) \
+    _apple_glx_log(l, __FILE__, __FUNCTION__, __LINE__, f, ## args)
+
+
+__printflike(5, 0)
+void _apple_glx_vlog(int level, const char *file, const char *function,
+                     int line, const char *fmt, va_list v);
+#define apple_glx_vlog(l, f, v) \
+    _apple_glx_vlog(l, __FILE__, __FUNCTION__, __LINE__, f, v)
+
+/* This is just here to help the transition.
+ * TODO: Replace calls to apple_glx_diagnostic
+ */
+#define apple_glx_diagnostic(f, args ...) \
+    apple_glx_log(ASL_LEVEL_DEBUG, f, ## args)
+
+#endif
diff --git a/src/glx/apple/apple_glx_surface.c b/src/glx/apple/apple_glx_surface.c
index 39f5130..d42fa3b 100644
--- a/src/glx/apple/apple_glx_surface.c
+++ b/src/glx/apple/apple_glx_surface.c
@@ -206,6 +206,10 @@ apple_glx_surface_destroy(unsigned int uid)
    if (d) {
       d->types.surface.pending_destroy = true;
       d->release(d);
+
+      /* apple_glx_drawable_find_by_uid returns a locked drawable */
+      d->unlock(d);
+
       /* 
        * We release 2 references to the surface.  One was acquired by
        * the find, and the other was leftover from a context, or 
@@ -217,7 +221,5 @@ apple_glx_surface_destroy(unsigned int uid)
        * by a glViewport callback (see apple_glx_context_update()).
        */
       d->destroy(d);
-
-      d->unlock(d);
    }
 }
diff --git a/src/mapi/glapi/glapi_gentable.c b/src/mapi/glapi/glapi_gentable.c
index 5c04801..640c495 100644
--- a/src/mapi/glapi/glapi_gentable.c
+++ b/src/mapi/glapi/glapi_gentable.c
@@ -105,7 +105,7 @@ __glapi_gentable_set_remaining_noop(struct _glapi_table *disp) {
 
 struct _glapi_table *
 _glapi_create_table_from_handle(void *handle, const char *symbol_prefix) {
-    struct _glapi_table *disp = calloc(1, sizeof(struct _glapi_table));
+    struct _glapi_table *disp = calloc(_glapi_get_dispatch_table_size(), sizeof(void *));
     char symboln[512];
 
     if(!disp)
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 36563ef..dc32292 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -76,6 +76,8 @@ i915InvalidateState(struct gl_context * ctx, GLuint new_state)
        i915_update_provoking_vertex(ctx);
    if (new_state & (_NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
        i915_update_program(ctx);
+   if (new_state & (_NEW_PROGRAM | _NEW_POINT))
+       i915_update_sprite_point_enable(ctx);
 }
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index 8167137..7037465 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -40,6 +40,7 @@
 #define I915_FALLBACK_POINT_SMOOTH	 0x80000
 #define I915_FALLBACK_POINT_SPRITE_COORD_ORIGIN	 0x100000
 #define I915_FALLBACK_DRAW_OFFSET	 0x200000
+#define I915_FALLBACK_COORD_REPLACE	 0x400000
 
 #define I915_UPLOAD_CTX              0x1
 #define I915_UPLOAD_BUFFERS          0x2
@@ -338,6 +339,7 @@ extern void i915InitStateFunctions(struct dd_function_table *functions);
 extern void i915InitState(struct i915_context *i915);
 extern void i915_update_stencil(struct gl_context * ctx);
 extern void i915_update_provoking_vertex(struct gl_context *ctx);
+extern void i915_update_sprite_point_enable(struct gl_context *ctx);
 
 
 /*======================================================================
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index 756001f..94c7327 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -652,6 +652,48 @@ i915PointParameterfv(struct gl_context * ctx, GLenum pname, const GLfloat *param
    }
 }
 
+void
+i915_update_sprite_point_enable(struct gl_context *ctx)
+{
+   struct intel_context *intel = intel_context(ctx);
+   /* _NEW_PROGRAM */
+   struct i915_fragment_program *p =
+      (struct i915_fragment_program *) ctx->FragmentProgram._Current;
+   const GLbitfield64 inputsRead = p->FragProg.Base.InputsRead;
+   struct i915_context *i915 = i915_context(ctx);
+   GLuint s4 = i915->state.Ctx[I915_CTXREG_LIS4] & ~S4_VFMT_MASK;
+   int i;
+   GLuint coord_replace_bits = 0x0;
+   GLuint tex_coord_unit_bits = 0x0;
+
+   for (i = 0; i < ctx->Const.MaxTextureCoordUnits; i++) {
+      /* _NEW_POINT */
+      if (ctx->Point.CoordReplace[i] && ctx->Point.PointSprite)
+         coord_replace_bits |= (1 << i);
+      if (inputsRead & FRAG_BIT_TEX(i))
+         tex_coord_unit_bits |= (1 << i);
+   }
+
+   /*
+    * Here we can't enable the SPRITE_POINT_ENABLE bit when the mis-match
+    * of tex_coord_unit_bits and coord_replace_bits, or this will make all
+    * the other non-point-sprite coords(like varying inputs, as we now use
+    * tex coord to implement varying inputs) be replaced to value (0, 0)-(1, 1).
+    *
+    * Thus, do fallback when needed.
+    */
+   FALLBACK(intel, I915_FALLBACK_COORD_REPLACE,
+            coord_replace_bits && coord_replace_bits != tex_coord_unit_bits);
+
+   s4 &= ~S4_SPRITE_POINT_ENABLE;
+   s4 |= (coord_replace_bits && coord_replace_bits == tex_coord_unit_bits) ?
+         S4_SPRITE_POINT_ENABLE : 0;
+   if (s4 != i915->state.Ctx[I915_CTXREG_LIS4]) {
+      i915->state.Ctx[I915_CTXREG_LIS4] = s4;
+      I915_STATECHANGE(i915, I915_UPLOAD_CTX);
+   }
+}
+
 
 /* =============================================================
  * Color masks
@@ -869,18 +911,7 @@ i915Enable(struct gl_context * ctx, GLenum cap, GLboolean state)
       break;
 
    case GL_POINT_SPRITE:
-      /* This state change is handled in i915_reduced_primitive_state because
-       * the hardware bit should only be set when rendering points.
-       */
-	 dw = i915->state.Ctx[I915_CTXREG_LIS4];
-      if (state)
-	 dw |= S4_SPRITE_POINT_ENABLE;
-      else
-	 dw &= ~S4_SPRITE_POINT_ENABLE;
-      if (dw != i915->state.Ctx[I915_CTXREG_LIS4]) {
-	 i915->state.Ctx[I915_CTXREG_LIS4] = dw;
-	 I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-      }
+      /* Handle it at i915_update_sprite_point_enable () */
       break;
 
    case GL_POINT_SMOOTH:
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index a36011a..68f0e05 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -1198,6 +1198,7 @@ static char *fallbackStrings[] = {
    [19] = "Smooth point",
    [20] = "point sprite coord origin",
    [21] = "depth/color drawing offset",
+   [22] = "coord replace(SPRITE POINT ENABLE)",
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index f660222..5064c18 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -1048,6 +1048,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p);
 struct brw_instruction *brw_BREAK(struct brw_compile *p);
 struct brw_instruction *brw_CONT(struct brw_compile *p);
 struct brw_instruction *gen6_CONT(struct brw_compile *p);
+struct brw_instruction *gen6_HALT(struct brw_compile *p);
 /* Forward jumps:
  */
 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index b2581da..21d3c5a 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1339,6 +1339,20 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
    return insn;
 }
 
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_HALT);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+   return insn;
+}
+
 /* DO/WHILE loop:
  *
  * The DO/WHILE is just an unterminated loop -- break or continue are
@@ -2395,8 +2409,8 @@ brw_find_next_block_end(struct brw_compile *p, int start)
 	 return ip;
       }
    }
-   assert(!"not reached");
-   return start + 1;
+
+   return 0;
 }
 
 /* There is no DO instruction on gen6, so to find the end of the loop
@@ -2425,7 +2439,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
 }
 
 /* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
  */
 void
 brw_set_uip_jip(struct brw_compile *p)
@@ -2439,21 +2453,50 @@ brw_set_uip_jip(struct brw_compile *p)
 
    for (ip = 0; ip < p->nr_insn; ip++) {
       struct brw_instruction *insn = &p->store[ip];
+      int block_end_ip = 0;
+
+      if (insn->header.opcode == BRW_OPCODE_BREAK ||
+	  insn->header.opcode == BRW_OPCODE_CONTINUE ||
+	  insn->header.opcode == BRW_OPCODE_HALT) {
+	 block_end_ip = brw_find_next_block_end(p, ip);
+      }
 
       switch (insn->header.opcode) {
       case BRW_OPCODE_BREAK:
-	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 assert(block_end_ip != 0);
+	 insn->bits3.break_cont.jip = br * (block_end_ip - ip);
 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
 	 insn->bits3.break_cont.uip =
 	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
 	 break;
       case BRW_OPCODE_CONTINUE:
-	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 assert(block_end_ip != 0);
+	 insn->bits3.break_cont.jip = br * (block_end_ip - ip);
 	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
 
 	 assert(insn->bits3.break_cont.uip != 0);
 	 assert(insn->bits3.break_cont.jip != 0);
 	 break;
+      case BRW_OPCODE_HALT:
+	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+	  *
+	  *    "In case of the halt instruction not inside any conditional code
+	  *     block, the value of <JIP> and <UIP> should be the same. In case
+	  *     of the halt instruction inside conditional code block, the <UIP>
+	  *     should be the end of the program, and the <JIP> should be end of
+	  *     the most inner conditional code block."
+	  *
+	  * The uip will have already been set by whoever set up the
+	  * instruction.
+	  */
+	 if (block_end_ip == 0) {
+	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+	 } else {
+	    insn->bits3.break_cont.jip = br * (block_end_ip - ip);
+	 }
+	 assert(insn->bits3.break_cont.uip != 0);
+	 assert(insn->bits3.break_cont.jip != 0);
+	 break;
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 9a2cc08..b9cd42f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -171,6 +171,26 @@ static const fs_reg reg_undef;
 static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
 static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
 
+class ip_record : public exec_node {
+public:
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = rzalloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   ip_record(int ip)
+   {
+      this->ip = ip;
+   }
+
+   int ip;
+};
+
 class fs_inst : public exec_node {
 public:
    /* Callers of this ralloc-based new need not call delete. It's
@@ -489,6 +509,7 @@ public:
    bool remove_duplicate_mrf_writes();
    bool virtual_grf_interferes(int a, int b);
    void schedule_instructions();
+   void patch_discard_jumps_to_fb_writes();
    void fail(const char *msg, ...);
 
    void push_force_uncompressed();
@@ -571,6 +592,7 @@ public:
    struct gl_shader_program *prog;
    void *mem_ctx;
    exec_list instructions;
+   exec_list discard_halt_patches;
 
    /* Delayed setup of c->prog_data.params[] due to realloc of
     * ParamValues[] during compile.
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index b68d8cb..cc70904 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -37,11 +37,55 @@ extern "C" {
 #include "glsl/ir_print_visitor.h"
 
 void
+fs_visitor::patch_discard_jumps_to_fb_writes()
+{
+   if (intel->gen < 6 || this->discard_halt_patches.is_empty())
+      return;
+
+   /* There is a somewhat strange undocumented requirement of using
+    * HALT, according to the simulator.  If some channel has HALTed to
+    * a particular UIP, then by the end of the program, every channel
+    * must have HALTed to that UIP.  Furthermore, the tracking is a
+    * stack, so you can't do the final halt of a UIP after starting
+    * halting to a new UIP.
+    *
+    * Symptoms of not emitting this instruction on actual hardware
+    * included GPU hangs and sparkly rendering on the piglit discard
+    * tests.
+    */
+   struct brw_instruction *last_halt = gen6_HALT(p);
+   last_halt->bits3.break_cont.uip = 2;
+   last_halt->bits3.break_cont.jip = 2;
+
+   int ip = p->nr_insn;
+
+   foreach_list(node, &this->discard_halt_patches) {
+      ip_record *patch_ip = (ip_record *)node;
+      struct brw_instruction *patch = &p->store[patch_ip->ip];
+      int br = (intel->gen >= 5) ? 2 : 1;
+
+      /* HALT takes a distance from the pre-incremented IP, so '1'
+       * would be the next instruction after jmpi.
+       */
+      assert(patch->header.opcode == BRW_OPCODE_HALT);
+      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * br;
+   }
+
+   this->discard_halt_patches.make_empty();
+}
+
+void
 fs_visitor::generate_fb_write(fs_inst *inst)
 {
    bool eot = inst->eot;
    struct brw_reg implied_header;
 
+   /* Note that the jumps emitted to this point mean that the g0 ->
+    * base_mrf setup must be inside of this function, so that we jump
+    * to a point containing it.
+    */
+   patch_discard_jumps_to_fb_writes();
+
    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
     * move, here's g1.
     */
@@ -482,6 +526,17 @@ fs_visitor::generate_discard(fs_inst *inst)
       brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_AND(p, g1, f0, g1);
       brw_pop_insn_state(p);
+
+      /* GLSL 1.30+ say that discarded channels should stop executing
+       * (so, for example, an infinite loop that would otherwise in
+       * just that channel does not occur.
+       *
+       * This HALT will be patched up at FB write time to point UIP at
+       * the end of the program, and at brw_uip_jip() JIP will be set
+       * to the end of the current block (or the program).
+       */
+      this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+      gen6_HALT(p);
    } else {
       struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 0632052..cec1e95 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1921,7 +1921,10 @@ fs_visitor::emit_fb_writes()
 {
    this->current_annotation = "FB write header";
    bool header_present = true;
-   int base_mrf = 2;
+   /* We can potentially have a message length of up to 15, so we have to set
+    * base_mrf to either 0 or 1 in order to fit in m0..m15.
+    */
+   int base_mrf = 1;
    int nr = base_mrf;
    int reg_width = c->dispatch_width / 8;
 
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 7a1b91f..8bf1d3d 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -115,6 +115,8 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 	       intel_miptree_set_image_offset(mt, level, q, x, y);
 	       x += pack_x_pitch;
 	    }
+            if (x > mt->total_width)
+               mt->total_width = x;
 
 	    x = 0;
 	    y += pack_y_pitch;
@@ -135,10 +137,9 @@ brw_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree *mt)
 	       pack_x_nr <<= 1;
 	    }
 	 } else {
+            pack_x_nr <<= 1;
 	    if (pack_x_pitch > 4) {
 	       pack_x_pitch >>= 1;
-	       pack_x_nr <<= 1;
-	       assert(pack_x_pitch * pack_x_nr <= mt->total_width);
 	    }
 
 	    if (pack_y_pitch > 2) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 51d3a46..97ae489 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -916,12 +916,48 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
    struct gl_context *ctx = &intel->ctx;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_mipmap_tree *mt = irb->mt;
-   struct intel_region *region = irb->mt->region;
+   struct intel_region *region;
    uint32_t *surf;
    uint32_t tile_x, tile_y;
    uint32_t format = 0;
    gl_format rb_format = intel_rb_format(irb);
 
+   if (irb->tex_image && !brw->has_surface_tile_offset) {
+      intel_renderbuffer_tile_offsets(irb, &tile_x, &tile_y);
+
+      if (tile_x != 0 || tile_y != 0) {
+	 /* Original gen4 hardware couldn't draw to a non-tile-aligned
+	  * destination in a miptree unless you actually setup your renderbuffer
+	  * as a miptree and used the fragile lod/array_index/etc. controls to
+	  * select the image.  So, instead, we just make a new single-level
+	  * miptree and render into that.
+	  */
+	 struct intel_context *intel = intel_context(ctx);
+	 struct intel_texture_image *intel_image =
+	    intel_texture_image(irb->tex_image);
+	 struct intel_mipmap_tree *new_mt;
+	 int width, height, depth;
+
+	 intel_miptree_get_dimensions_for_image(irb->tex_image, &width, &height, &depth);
+
+	 new_mt = intel_miptree_create(intel, irb->tex_image->TexObject->Target,
+				       intel_image->base.Base.TexFormat,
+				       intel_image->base.Base.Level,
+				       intel_image->base.Base.Level,
+				       width, height, depth,
+				       true);
+
+	 intel_miptree_copy_teximage(intel, intel_image, new_mt);
+	 intel_miptree_reference(&irb->mt, intel_image->mt);
+	 intel_renderbuffer_set_draw_offset(irb);
+	 intel_miptree_release(&new_mt);
+
+	 mt = irb->mt;
+      }
+   }
+
+   region = irb->mt->region;
+
    surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
 			  6 * 4, 32, &brw->bind.surf_offset[unit]);
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
index 15cae0a..a9a9df5 100644
--- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
@@ -41,7 +41,7 @@ upload_sampler_state_pointers(struct brw_context *brw)
 	     GS_SAMPLER_STATE_CHANGE |
 	     PS_SAMPLER_STATE_CHANGE |
 	     (4 - 2));
-   OUT_BATCH(0); /* VS */
+   OUT_BATCH(brw->sampler.offset); /* VS */
    OUT_BATCH(0); /* GS */
    OUT_BATCH(brw->sampler.offset);
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index d3c0d70..9cdd804 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -1225,6 +1225,10 @@ intel_process_dri2_buffer_with_separate_stencil(struct intel_context *intel,
    if (!rb)
       return;
 
+   /* Check if we failed to allocate the depth miptree earlier. */
+   if (buffer->attachment == __DRI_BUFFER_HIZ && rb->mt == NULL)
+     return;
+
    /* If the renderbuffer's and DRIbuffer's regions match, then continue. */
    if ((buffer->attachment != __DRI_BUFFER_HIZ &&
 	rb->mt &&
@@ -1266,6 +1270,7 @@ intel_process_dri2_buffer_with_separate_stencil(struct intel_context *intel,
     * due to failure to allocate new storage.
     */
    if (buffer->attachment == __DRI_BUFFER_HIZ) {
+      assert(rb->mt);
       intel_miptree_release(&rb->mt->hiz_mt);
    } else {
       intel_miptree_release(&rb->mt);
@@ -1291,6 +1296,7 @@ intel_process_dri2_buffer_with_separate_stencil(struct intel_context *intel,
 
    /* Associate buffer with new storage. */
    if (buffer->attachment == __DRI_BUFFER_HIZ) {
+      assert(rb->mt);
       rb->mt->hiz_mt = mt;
    } else {
       rb->mt = mt;
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 185602a..c5097c3 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -553,22 +553,6 @@ intel_renderbuffer_tile_offsets(struct intel_renderbuffer *irb,
    }
 }
 
-#ifndef I915
-static bool
-need_tile_offset_workaround(struct brw_context *brw,
-			    struct intel_renderbuffer *irb)
-{
-   uint32_t tile_x, tile_y;
-
-   if (brw->has_surface_tile_offset)
-      return false;
-
-   intel_renderbuffer_tile_offsets(irb, &tile_x, &tile_y);
-
-   return tile_x != 0 || tile_y != 0;
-}
-#endif
-
 /**
  * Called by glFramebufferTexture[123]DEXT() (and other places) to
  * prepare for rendering into texture memory.  This might be called
@@ -626,42 +610,13 @@ intel_render_texture(struct gl_context * ctx,
        return;
    }
 
+   irb->tex_image = image;
+
    DBG("Begin render %s texture tex=%u w=%d h=%d refcount=%d\n",
        _mesa_get_format_name(image->TexFormat),
        att->Texture->Name, image->Width, image->Height,
        irb->Base.Base.RefCount);
 
-   intel_image->used_as_render_target = true;
-
-#ifndef I915
-   if (need_tile_offset_workaround(brw_context(ctx), irb)) {
-      /* Original gen4 hardware couldn't draw to a non-tile-aligned
-       * destination in a miptree unless you actually setup your
-       * renderbuffer as a miptree and used the fragile
-       * lod/array_index/etc. controls to select the image.  So,
-       * instead, we just make a new single-level miptree and render
-       * into that.
-       */
-      struct intel_context *intel = intel_context(ctx);
-      struct intel_mipmap_tree *new_mt;
-      int width, height, depth;
-
-      intel_miptree_get_dimensions_for_image(image, &width, &height, &depth);
-
-      new_mt = intel_miptree_create(intel, image->TexObject->Target,
-				    intel_image->base.Base.TexFormat,
-				    intel_image->base.Base.Level,
-				    intel_image->base.Base.Level,
-                                    width, height, depth,
-				    true);
-
-      intel_miptree_copy_teximage(intel, intel_image, new_mt);
-      intel_renderbuffer_set_draw_offset(irb);
-
-      intel_miptree_reference(&irb->mt, intel_image->mt);
-      intel_miptree_release(&new_mt);
-   }
-#endif
    /* update drawing region, etc */
    intel_draw_buffer(ctx);
 }
@@ -678,14 +633,13 @@ intel_finish_render_texture(struct gl_context * ctx,
    struct gl_texture_object *tex_obj = att->Texture;
    struct gl_texture_image *image =
       tex_obj->Image[att->CubeMapFace][att->TextureLevel];
-   struct intel_texture_image *intel_image = intel_texture_image(image);
+   struct intel_renderbuffer *irb = intel_renderbuffer(att->Renderbuffer);
 
    DBG("Finish render %s texture tex=%u\n",
        _mesa_get_format_name(image->TexFormat), att->Texture->Name);
 
-   /* Flag that this image may now be validated into the object's miptree. */
-   if (intel_image)
-      intel_image->used_as_render_target = false;
+   if (irb)
+      irb->tex_image = NULL;
 
    /* Since we've (probably) rendered to the texture and will (likely) use
     * it in the texture domain later on in this batchbuffer, flush the
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index a2c1b1a..724f141 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -47,6 +47,9 @@ struct intel_renderbuffer
    struct intel_mipmap_tree *mt; /**< The renderbuffer storage. */
    drm_intel_bo *map_bo;
 
+   /* Current texture image this renderbuffer is attached to. */
+   struct gl_texture_image *tex_image;
+
    /**
     * \name Miptree view
     * \{
diff --git a/src/mesa/drivers/dri/intel/intel_tex_obj.h b/src/mesa/drivers/dri/intel/intel_tex_obj.h
index 8b278ba..d1a5f05 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_obj.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_obj.h
@@ -65,7 +65,6 @@ struct intel_texture_image
     * Else there is no image data.
     */
    struct intel_mipmap_tree *mt;
-   bool used_as_render_target;
 };
 
 static INLINE struct intel_texture_object *
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index b96f2a4..a63068b 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -97,14 +97,8 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
 	 /* skip too small size mipmap */
  	 if (intelImage == NULL)
 		 break;
-	 /* Need to import images in main memory or held in other trees.
-	  * If it's a render target, then its data isn't needed to be in
-	  * the object tree (otherwise we'd be FBO incomplete), and we need
-	  * to keep track of the image's MT as needing to be pulled in still,
-	  * or we'll lose the rendering that's done to it.
-          */
-         if (intelObj->mt != intelImage->mt &&
-	     !intelImage->used_as_render_target) {
+
+         if (intelObj->mt != intelImage->mt) {
             intel_miptree_copy_teximage(intel, intelImage, intelObj->mt);
          }
       }
diff --git a/src/mesa/drivers/windows/gdi/wmesa.c b/src/mesa/drivers/windows/gdi/wmesa.c
index 40aa56e..93da05f 100644
--- a/src/mesa/drivers/windows/gdi/wmesa.c
+++ b/src/mesa/drivers/windows/gdi/wmesa.c
@@ -243,39 +243,9 @@ static void wmesa_flush(struct gl_context *ctx)
 /*****                   CLEAR Functions                          *****/
 /**********************************************************************/
 
-/* If we do not implement these, Mesa clears the buffers via the pixel
- * span writing interface, which is very slow for a clear operation.
- */
-
-/*
- * Set the color used to clear the color buffer.
- */
-static void clear_color(struct gl_context *ctx,
-                        const union gl_color_union color)
-{
-    WMesaContext pwc = wmesa_context(ctx);
-    GLubyte col[3];
-
-    UNCLAMPED_FLOAT_TO_UBYTE(col[0], color.f[0]);
-    UNCLAMPED_FLOAT_TO_UBYTE(col[1], color.f[1]);
-    UNCLAMPED_FLOAT_TO_UBYTE(col[2], color.f[2]);
-    pwc->clearColorRef = RGB(col[0], col[1], col[2]);
-    DeleteObject(pwc->clearPen);
-    DeleteObject(pwc->clearBrush);
-    pwc->clearPen = CreatePen(PS_SOLID, 1, pwc->clearColorRef); 
-    pwc->clearBrush = CreateSolidBrush(pwc->clearColorRef); 
-}
-
-
 /* 
- * Clear the specified region of the color buffer using the clear color 
- * or index as specified by one of the two functions above. 
- * 
- * This procedure clears either the front and/or the back COLOR buffers. 
- * Only the "left" buffer is cleared since we are not stereo. 
- * Clearing of the other non-color buffers is left to the swrast. 
+ * Clear the color/depth/stencil buffers.
  */ 
-
 static void clear(struct gl_context *ctx, GLbitfield mask)
 {
 #define FLIP(Y)  (ctx->DrawBuffer->Height - (Y) - 1)
@@ -298,6 +268,20 @@ static void clear(struct gl_context *ctx, GLbitfield mask)
 	return;
     }
 
+    if (mask & BUFFER_BITS_COLOR) {
+       /* setup the clearing color */
+       const union gl_color_union color = ctx->Color.ClearColor;
+       GLubyte col[3];
+       UNCLAMPED_FLOAT_TO_UBYTE(col[0], color.f[0]);
+       UNCLAMPED_FLOAT_TO_UBYTE(col[1], color.f[1]);
+       UNCLAMPED_FLOAT_TO_UBYTE(col[2], color.f[2]);
+       pwc->clearColorRef = RGB(col[0], col[1], col[2]);
+       DeleteObject(pwc->clearPen);
+       DeleteObject(pwc->clearBrush);
+       pwc->clearPen = CreatePen(PS_SOLID, 1, pwc->clearColorRef); 
+       pwc->clearBrush = CreateSolidBrush(pwc->clearColorRef); 
+    }
+
     /* Back buffer */
     if (mask & BUFFER_BIT_BACK_LEFT) { 
 	
@@ -940,54 +924,6 @@ wmesa_renderbuffer_storage(struct gl_context *ctx,
 
 
 /**
- * Plug in the Get/PutRow/Values functions for a renderbuffer depending
- * on if we're drawing to the front or back color buffer.
- */
-static void
-wmesa_set_renderbuffer_funcs(struct gl_renderbuffer *rb, int pixelformat,
-                             int cColorBits, int double_buffer)
-{
-    if (double_buffer) {
-        /* back buffer */
-	/* Picking the correct span functions is important because
-	 * the DIB was allocated with the indicated depth. */
-	switch(pixelformat) {
-	case PF_5R6G5B:
-	    rb->PutRow = write_rgba_span_16;
-	    rb->PutValues = write_rgba_pixels_16;
-	    rb->GetRow = read_rgba_span_16;
-	    rb->GetValues = read_rgba_pixels_16;
-	    break;
-	case PF_8R8G8B:
-		if (cColorBits == 24)
-		{
-		    rb->PutRow = write_rgba_span_24;
-		    rb->PutValues = write_rgba_pixels_24;
-		    rb->GetRow = read_rgba_span_24;
-		    rb->GetValues = read_rgba_pixels_24;
-		}
-		else
-		{
-                    rb->PutRow = write_rgba_span_32;
-                    rb->PutValues = write_rgba_pixels_32;
-                    rb->GetRow = read_rgba_span_32;
-                    rb->GetValues = read_rgba_pixels_32;
-		}
-	    break;
-	default:
-	    break;
-	}
-    }
-    else {
-        /* front buffer (actual Windows window) */
-	rb->PutRow = write_rgba_span_front;
-	rb->PutValues = write_rgba_pixels_front;
-	rb->GetRow = read_rgba_span_front;
-	rb->GetValues = read_rgba_pixels_front;
-    }
-}
-
-/**
  * Called by ctx->Driver.ResizeBuffers()
  * Resize the front/back colorbuffers to match the latest window size.
  */
@@ -1143,7 +1079,6 @@ WMesaContext WMesaCreateContext(HDC hDC,
     functions.GetBufferSize = wmesa_get_buffer_size;
     functions.Flush = wmesa_flush;
     functions.Clear = clear;
-    functions.ClearColor = clear_color;
     functions.ResizeBuffers = wmesa_resize_buffers;
     functions.Viewport = wmesa_viewport;
 
@@ -1275,11 +1210,9 @@ void WMesaMakeCurrent(WMesaContext c, HDC hdc)
         if (visual->doubleBufferMode == 1) {
             rb = wmesa_new_renderbuffer();
             _mesa_add_renderbuffer(&pwfb->Base, BUFFER_BACK_LEFT, rb);
-            wmesa_set_renderbuffer_funcs(rb, pwfb->pixelformat, pwfb->cColorBits, 1);
 	}
         rb = wmesa_new_renderbuffer();
         _mesa_add_renderbuffer(&pwfb->Base, BUFFER_FRONT_LEFT, rb);
-        wmesa_set_renderbuffer_funcs(rb, pwfb->pixelformat, pwfb->cColorBits, 0);
 
 	/* Let Mesa own the Depth, Stencil, and Accum buffers */
         _swrast_add_soft_renderbuffers(&pwfb->Base,
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 5b3c246..f3a0d10 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -196,6 +196,11 @@ read_stencil_pixels( struct gl_context *ctx,
    ctx->Driver.UnmapRenderbuffer(ctx, rb);
 }
 
+
+/**
+ * Try to do glReadPixels of RGBA data using a simple memcpy or swizzle.
+ * \return GL_TRUE if successful, GL_FALSE otherwise (use the slow path)
+ */
 static GLboolean
 fast_read_rgba_pixels_memcpy( struct gl_context *ctx,
 			      GLint x, GLint y,
@@ -208,8 +213,20 @@ fast_read_rgba_pixels_memcpy( struct gl_context *ctx,
    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorReadBuffer;
    GLubyte *dst, *map;
    int dstStride, stride, j, texelBytes;
+   GLboolean swizzle_rb = GL_FALSE, copy_xrgb = GL_FALSE;
 
-   if (!_mesa_format_matches_format_and_type(rb->Format, format, type))
+   /* XXX we could check for other swizzle/special cases here as needed */
+   if (rb->Format == MESA_FORMAT_RGBA8888_REV &&
+       format == GL_BGRA &&
+       type == GL_UNSIGNED_INT_8_8_8_8_REV) {
+      swizzle_rb = GL_TRUE;
+   }
+   else if (rb->Format == MESA_FORMAT_XRGB8888 &&
+            format == GL_BGRA &&
+            type == GL_UNSIGNED_INT_8_8_8_8_REV) {
+      copy_xrgb = GL_TRUE;
+   }
+   else if (!_mesa_format_matches_format_and_type(rb->Format, format, type))
       return GL_FALSE;
 
    /* check for things we can't handle here */
@@ -240,10 +257,39 @@ fast_read_rgba_pixels_memcpy( struct gl_context *ctx,
    }
 
    texelBytes = _mesa_get_format_bytes(rb->Format);
-   for (j = 0; j < height; j++) {
-      memcpy(dst, map, width * texelBytes);
-      dst += dstStride;
-      map += stride;
+
+   if (swizzle_rb) {
+      /* swap R/B */
+      for (j = 0; j < height; j++) {
+         int i;
+         for (i = 0; i < width; i++) {
+            GLuint *dst4 = (GLuint *) dst, *map4 = (GLuint *) map;
+            GLuint pixel = map4[i];
+            dst4[i] = (pixel & 0xff00ff00)
+                   | ((pixel & 0x00ff0000) >> 16)
+                   | ((pixel & 0x000000ff) << 16);
+         }
+         dst += dstStride;
+         map += stride;
+      }
+   } else if (copy_xrgb) {
+      /* convert xrgb -> argb */
+      for (j = 0; j < height; j++) {
+         GLuint *dst4 = (GLuint *) dst, *map4 = (GLuint *) map;
+         int i;
+         for (i = 0; i < width; i++) {
+            dst4[i] = map4[i] | 0xff000000;  /* set A=0xff */
+         }
+         dst += dstStride;
+         map += stride;
+      }
+   } else {
+      /* just memcpy */
+      for (j = 0; j < height; j++) {
+         memcpy(dst, map, width * texelBytes);
+         dst += dstStride;
+         map += stride;
+      }
    }
 
    ctx->Driver.UnmapRenderbuffer(ctx, rb);