From 6858dd50c9b696c1c6044f5a403000f9d20b286b Mon Sep 17 00:00:00 2001
From: Younes Manton <younes.m@gmail.com>
Date: Sat, 16 Aug 2008 13:04:23 -0400
Subject: g3dvl: Modularized rendering, refactored to accommodate VAAPI, other
 APIs.

---
 src/gallium/state_trackers/g3dvl/Makefile          |    4 +-
 src/gallium/state_trackers/g3dvl/vl_basic_csc.c    |  694 ++++++
 src/gallium/state_trackers/g3dvl/vl_basic_csc.h    |   13 +
 src/gallium/state_trackers/g3dvl/vl_context.c      | 2272 +------------------
 src/gallium/state_trackers/g3dvl/vl_context.h      |  118 +-
 src/gallium/state_trackers/g3dvl/vl_csc.h          |   53 +
 src/gallium/state_trackers/g3dvl/vl_data.c         |  130 +-
 src/gallium/state_trackers/g3dvl/vl_data.h         |   19 +-
 src/gallium/state_trackers/g3dvl/vl_defs.h         |    1 -
 src/gallium/state_trackers/g3dvl/vl_display.c      |   48 +
 src/gallium/state_trackers/g3dvl/vl_display.h      |   29 +
 src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c  | 2315 ++++++++++++++++++++
 src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h  |   18 +
 src/gallium/state_trackers/g3dvl/vl_render.h       |   33 +
 src/gallium/state_trackers/g3dvl/vl_screen.c       |  115 +
 src/gallium/state_trackers/g3dvl/vl_screen.h       |   63 +
 src/gallium/state_trackers/g3dvl/vl_shader_build.c |   37 +-
 src/gallium/state_trackers/g3dvl/vl_shader_build.h |    1 -
 src/gallium/state_trackers/g3dvl/vl_surface.c      |  671 +-----
 src/gallium/state_trackers/g3dvl/vl_surface.h      |   91 +-
 src/gallium/state_trackers/g3dvl/vl_types.h        |  124 +-
 src/gallium/state_trackers/g3dvl/vl_util.c         |    9 +-
 src/gallium/state_trackers/g3dvl/vl_util.h         |    1 -
 23 files changed, 3794 insertions(+), 3065 deletions(-)
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_basic_csc.c
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_basic_csc.h
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_csc.h
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_display.c
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_display.h
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_render.h
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_screen.c
 create mode 100644 src/gallium/state_trackers/g3dvl/vl_screen.h

(limited to 'src/gallium')

diff --git a/src/gallium/state_trackers/g3dvl/Makefile b/src/gallium/state_trackers/g3dvl/Makefile
index c6a22cad4e..9995c554ab 100644
--- a/src/gallium/state_trackers/g3dvl/Makefile
+++ b/src/gallium/state_trackers/g3dvl/Makefile
@@ -1,5 +1,6 @@
 TARGET		= libg3dvl.a
-OBJECTS		= vl_context.o vl_data.o vl_surface.o vl_shader_build.o vl_util.o
+OBJECTS		= vl_display.o vl_screen.o vl_context.o vl_surface.o vl_data.o vl_shader_build.o vl_util.o vl_basic_csc.o \
+		  vl_r16snorm_mc.o
 GALLIUMDIR	= ../..
 
 CFLAGS		+= -g -Wall -fPIC -Werror -I${GALLIUMDIR}/include -I${GALLIUMDIR}/auxiliary -I${GALLIUMDIR}/winsys/g3dvl
@@ -15,4 +16,3 @@ ${TARGET}: ${OBJECTS}
 
 clean:
 	rm -rf ${OBJECTS} ${TARGET}
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
new file mode 100644
index 0000000000..ea003a31d1
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
@@ -0,0 +1,694 @@
+#define VL_INTERNAL
+#include "vl_basic_csc.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_state.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_csc.h"
+#include "vl_surface.h"
+#include "vl_shader_build.h"
+#include "vl_types.h"
+
+struct vlVertexShaderConsts	/* Layout of the vertex shader constant buffer (vs_const_buf) */
+{
+	struct vlVertex4f	src_scale;	/* c0: source rect size / texture size, written by vlPutPictureCSC() */
+	struct vlVertex4f	src_trans;	/* c1: source rect origin / texture size, written by vlPutPictureCSC() */
+};
+
+struct vlFragmentShaderConsts	/* Layout of the fragment shader constant buffer (fs_const_buf) */
+{
+	struct vlVertex4f	bias;		/* c0: subtracted from the sampled pixel before the matrix multiply */
+	float			matrix[16];	/* c1-c4: four floats per dp4 in the fragment shader */
+};
+
+struct vlBasicCSC	/* Private state behind the struct vlCSC handed out by vlCreateBasicCSC() */
+{
+	struct vlCSC				base;	/* Must stay first: the callbacks cast struct vlCSC* to struct vlBasicCSC* */
+
+	struct pipe_context			*pipe;	/* Supplied by the caller of vlCreateBasicCSC() */
+	struct pipe_viewport_state		viewport;	/* Filled in by vlResizeFrameBuffer() */
+	struct pipe_framebuffer_state		framebuffer;	/* cbufs[0] is allocated by vlResizeFrameBuffer() */
+	void					*sampler;
+	void					*vertex_shader, *fragment_shader;
+	struct pipe_vertex_buffer 		vertex_bufs[2];	/* [0] positions, [1] texcoords */
+	struct pipe_vertex_element		vertex_elems[2];
+	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
+};
+
+static int vlResizeFrameBuffer	/* Resizes the CSC render target and viewport; always returns 0 */
+(
+	struct vlCSC *csc,
+	unsigned int width,
+	unsigned int height
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	if (basic_csc->framebuffer.width == width && basic_csc->framebuffer.height == height)
+		return 0;	/* Size unchanged: keep the existing colour buffer */
+
+	if (basic_csc->framebuffer.cbufs[0])	/* Drop the old colour buffer before replacing it */
+		pipe->winsys->surface_release
+		(
+			pipe->winsys,
+			&basic_csc->framebuffer.cbufs[0]
+		);
+
+	basic_csc->viewport.scale[0] = width;	/* Viewport scale follows the new frame buffer size */
+	basic_csc->viewport.scale[1] = height;
+	basic_csc->viewport.scale[2] = 1;
+	basic_csc->viewport.scale[3] = 1;
+	basic_csc->viewport.translate[0] = 0;
+	basic_csc->viewport.translate[1] = 0;
+	basic_csc->viewport.translate[2] = 0;
+	basic_csc->viewport.translate[3] = 0;
+
+	basic_csc->framebuffer.width = width;
+	basic_csc->framebuffer.height = height;
+	basic_csc->framebuffer.cbufs[0] = pipe->winsys->surface_alloc(pipe->winsys);	/* NOTE(review): result is not checked */
+	pipe->winsys->surface_alloc_storage
+	(
+		pipe->winsys,
+		basic_csc->framebuffer.cbufs[0],
+		width,
+		height,
+		PIPE_FORMAT_A8R8G8B8_UNORM,
+		/* XXX: SoftPipe doesn't change GPU usage to CPU like it does for textures */
+		PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE,
+		0
+	);
+
+	return 0;
+}
+
+static int vlBegin	/* Binds all CSC pipeline state on the pipe context; always returns 0 */
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer);
+	pipe->set_viewport_state(pipe, &basic_csc->viewport);
+	pipe->bind_sampler_states(pipe, 1, (void**)&basic_csc->sampler);
+	/* Source texture is bound per picture in vlPutPictureCSC() */
+	pipe->bind_vs_state(pipe, basic_csc->vertex_shader);
+	pipe->bind_fs_state(pipe, basic_csc->fragment_shader);
+	pipe->set_vertex_buffers(pipe, 2, basic_csc->vertex_bufs);
+	pipe->set_vertex_elements(pipe, 2, basic_csc->vertex_elems);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &basic_csc->vs_const_buf);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &basic_csc->fs_const_buf);
+
+	return 0;
+}
+
+static int vlPutPictureCSC	/* Draws one surface through the CSC shaders; always returns 0 */
+(
+	struct vlCSC *csc,
+	struct vlSurface *surface,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,	/* destx, desty, destw, desth and picture_type are not read by this implementation */
+	int desty,
+	int destw,
+	int desth,
+	enum vlPictureType picture_type
+)
+{
+	struct vlBasicCSC		*basic_csc;
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(csc);
+	assert(surface);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map	/* NOTE(review): the mapped pointer is used without a NULL check */
+	(
+		pipe->winsys,
+		basic_csc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->src_scale.x = srcw / (float)surface->texture->width[0];	/* Source rect size as a fraction of the texture */
+	vs_consts->src_scale.y = srch / (float)surface->texture->height[0];
+	vs_consts->src_scale.z = 1;
+	vs_consts->src_scale.w = 1;
+	vs_consts->src_trans.x = srcx / (float)surface->texture->width[0];	/* Source rect origin as a fraction of the texture */
+	vs_consts->src_trans.y = srcy / (float)surface->texture->height[0];
+	vs_consts->src_trans.z = 0;
+	vs_consts->src_trans.w = 0;
+
+	pipe->winsys->buffer_unmap(pipe->winsys, basic_csc->vs_const_buf.buffer);
+
+	pipe->set_sampler_textures(pipe, 1, &surface->texture);
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);	/* 4 vertices drawn as a triangle strip */
+
+	return 0;
+}
+
+static int vlEnd	/* No-op apart from the argument check; always returns 0 */
+(
+	struct vlCSC *csc
+)
+{
+	assert(csc);
+
+	return 0;
+}
+
+static struct pipe_surface* vlGetFrameBuffer	/* Returns the colour buffer the CSC pass renders into */
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+
+	return basic_csc->framebuffer.cbufs[0];	/* NULL until vlResizeFrameBuffer() has allocated it */
+}
+
+static int vlDestroy	/* Releases the pipe objects owned by the CSC and frees it; always returns 0 */
+(
+	struct vlCSC *csc
+)
+{
+	struct vlBasicCSC	*basic_csc;
+	struct pipe_context	*pipe;
+	unsigned int		i;
+
+	assert(csc);
+
+	basic_csc = (struct vlBasicCSC*)csc;
+	pipe = basic_csc->pipe;
+
+	if (basic_csc->framebuffer.cbufs[0])	/* Only present once vlResizeFrameBuffer() has run */
+		pipe->winsys->surface_release
+		(
+			pipe->winsys,
+			&basic_csc->framebuffer.cbufs[0]
+		);
+
+	pipe->delete_sampler_state(pipe, basic_csc->sampler);
+	pipe->delete_vs_state(pipe, basic_csc->vertex_shader);
+	pipe->delete_fs_state(pipe, basic_csc->fragment_shader);
+
+	for (i = 0; i < 2; ++i)	/* Position and texcoord buffers from vlCreateDataBufs() */
+		pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vertex_bufs[i].buffer);
+
+	pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vs_const_buf.buffer);
+	pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->fs_const_buf.buffer);
+
+	free(basic_csc);	/* csc points into basic_csc and is invalid after this */
+
+	return 0;
+}
+
+/*
+ * Represents 2 triangles in a strip in normalized coords.
+ * Used to render the surface onto the frame buffer.
+ */
+static const struct vlVertex2f surface_verts[4] =	/* Uploaded to vertex_bufs[0] by vlCreateDataBufs() */
+{
+	{0.0f, 0.0f},
+	{0.0f, 1.0f},
+	{1.0f, 0.0f},
+	{1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above. We can use the position values directly.
+ * TODO: Duplicate these in the shader, no need to create a buffer.
+ */
+static const struct vlVertex2f *surface_texcoords = surface_verts;	/* Alias of surface_verts, not a copy */
+
+/*
+ * Identity color conversion constants, for debugging
+ */
+static const struct vlFragmentShaderConsts identity =	/* Not referenced elsewhere in this file */
+{
+	{
+		0.0f, 0.0f, 0.0f, 0.0f	/* bias: none */
+	},
+	{
+		1.0f, 0.0f, 0.0f, 0.0f,	/* matrix: 4x4 identity */
+		0.0f, 1.0f, 0.0f, 0.0f,
+		0.0f, 0.0f, 1.0f, 0.0f,
+		0.0f, 0.0f, 0.0f, 1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct vlFragmentShaderConsts bt_601 =	/* The table vlCreateDataBufs() uploads to fs_const_buf */
+{
+	{
+		0.0f,		0.501960784f,	0.501960784f,	0.0f	/* bias: 0.501960784 = 128/255 on the 2nd and 3rd components */
+	},
+	{
+		1.0f,		0.0f,		1.371f,		0.0f,
+		1.0f,		-0.336f,	-0.698f,	0.0f,
+		1.0f,		1.732f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const struct vlFragmentShaderConsts bt_601_full =	/* Not referenced elsewhere in this file */
+{
+	{
+		0.062745098f,	0.501960784f,	0.501960784f,	0.0f	/* bias: 0.062745098 = 16/255, 0.501960784 = 128/255 */
+	},
+	{
+		1.164f,		0.0f,		1.596f,		0.0f,
+		1.164f,		-0.391f,	-0.813f,	0.0f,
+		1.164f,		2.018f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const struct vlFragmentShaderConsts bt_709 =	/* Not referenced elsewhere in this file */
+{
+	{
+		0.0f,		0.501960784f,	0.501960784f,	0.0f	/* bias: 0.501960784 = 128/255 on the 2nd and 3rd components */
+	},
+	{
+		1.0f,		0.0f,		1.540f,		0.0f,
+		1.0f,		-0.183f,	-0.459f,	0.0f,
+		1.0f,		1.816f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+const struct vlFragmentShaderConsts bt_709_full =	/* NOTE(review): the only table here without 'static'; confirm no external user before changing linkage */
+{
+	{
+		0.062745098f,	0.501960784f,	0.501960784f,	0.0f	/* bias: 0.062745098 = 16/255, 0.501960784 = 128/255 */
+	},
+	{
+		1.164f,		0.0f,		1.793f,		0.0f,
+		1.164f,		-0.213f,	-0.534f,	0.0f,
+		1.164f,		2.115f,		0.0f,		0.0f,
+		0.0f,		0.0f,		0.0f,		1.0f
+	}
+};
+
+static int vlCreateVertexShader	/* Builds the CSC vertex shader into csc->vertex_shader; always returns 0 */
+(
+	struct vlBasicCSC *csc
+)
+{
+	const unsigned int		max_tokens = 50;	/* Capacity of the token buffer allocated below */
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(csc);
+
+	pipe = csc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0-2 hold version, header and processor */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Vertex texcoords
+	 */
+	for (i = 0; i < 2; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale texcoord rect to source size
+	 * decl c1		; Translation vector to move texcoord rect into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Vertex texcoords
+	 */
+	for (i = 0; i < 2; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mov o0, i0		; Move pos in to pos out */
+	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_INPUT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i1, c0	; Scale unit texcoord rect to source size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 1, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o1, t0, c1	; Translate texcoord rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	csc->vertex_shader = pipe->create_vs_state(pipe, &vs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShader	/* Builds the CSC fragment shader into csc->fragment_shader; always returns 0 */
+(
+	struct vlBasicCSC *csc
+)
+{
+	const unsigned int		max_tokens = 50;	/* Capacity of the token buffer allocated below */
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(csc);
+
+	pipe = csc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;	/* Tokens 0-2 hold version, header and processor */
+
+	/* decl i0		; Texcoords for s0 */
+	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl c0		; Bias vector for CSC
+	 * decl c1-c4		; CSC matrix c1-c4
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0		; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl s0		; Sampler for tex containing picture to display */
+	decl = vl_decl_samplers(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* tex2d t0, i0, s0	; Read src pixel */
+	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t0, t0, c0	; Subtract bias vector from pixel */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * dp4 o0.x, t0, c1	; Multiply pixel by the color conversion matrix
+	 * dp4 o0.y, t0, c2
+	 * dp4 o0.z, t0, c3
+	 * dp4 o0.w, t0, c4	; XXX: Don't need 4th coefficient
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;	/* One output component per iteration */
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	csc->fragment_shader = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateDataBufs	/* Creates and fills the vertex, texcoord and constant buffers; always returns 0 */
+(
+	struct vlBasicCSC *csc
+)
+{
+	struct pipe_context *pipe;
+
+	assert(csc);
+
+	pipe = csc->pipe;
+
+	/*
+	Create our vertex buffer and vertex buffer element
+	VB contains 4 vertices that render a quad covering the entire window
+	to display a rendered surface
+	Quad is rendered as a tri strip
+	*/
+	csc->vertex_bufs[0].pitch = sizeof(struct vlVertex2f);
+	csc->vertex_bufs[0].max_index = 3;
+	csc->vertex_bufs[0].buffer_offset = 0;
+	csc->vertex_bufs[0].buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_VERTEX,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	memcpy	/* NOTE(review): buffer_map() result is passed straight to memcpy() without a NULL check */
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		surface_verts,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[0].buffer);
+
+	csc->vertex_elems[0].src_offset = 0;
+	csc->vertex_elems[0].vertex_buffer_index = 0;
+	csc->vertex_elems[0].nr_components = 2;
+	csc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/*
+	Create our texcoord buffer and texcoord buffer element
+	Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
+	*/
+	csc->vertex_bufs[1].pitch = sizeof(struct vlVertex2f);
+	csc->vertex_bufs[1].max_index = 3;
+	csc->vertex_bufs[1].buffer_offset = 0;
+	csc->vertex_bufs[1].buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_VERTEX,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		surface_texcoords,
+		sizeof(struct vlVertex2f) * 4
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[1].buffer);
+
+	csc->vertex_elems[1].src_offset = 0;
+	csc->vertex_elems[1].vertex_buffer_index = 1;
+	csc->vertex_elems[1].nr_components = 2;
+	csc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+	/*
+	Create our vertex shader's constant buffer
+	Const buffer contains scaling and translation vectors
+	*/
+	csc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
+	csc->vs_const_buf.buffer = pipe->winsys->buffer_create	/* Contents are written later by vlPutPictureCSC() */
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		csc->vs_const_buf.size
+	);
+
+	/*
+	Create our fragment shader's constant buffer
+	Const buffer contains the color conversion matrix and bias vectors
+	*/
+	csc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
+	csc->fs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		csc->fs_const_buf.size
+	);
+
+	/*
+	TODO: Refactor this into a separate function,
+	allow changing the CSC matrix at runtime to switch between regular & full versions
+	*/
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, csc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		&bt_601,
+		sizeof(struct vlFragmentShaderConsts)
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, csc->fs_const_buf.buffer);
+
+	return 0;
+}
+
+static int vlInit	/* Creates the sampler, both shaders and all data buffers for a new CSC; always returns 0 */
+(
+	struct vlBasicCSC *csc
+)
+{
+	struct pipe_context		*pipe;
+	struct pipe_sampler_state	sampler = {0};	/* Zeroed so the fields left unset below are not indeterminate */
+
+	assert(csc);
+
+	pipe = csc->pipe;
+
+	/* Delay creating the FB until vlResizeFrameBuffer() so we know window size */
+	csc->framebuffer.num_cbufs = 1;
+	csc->framebuffer.cbufs[0] = NULL;
+	csc->framebuffer.zsbuf = NULL;
+
+	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+	sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+	sampler.compare_func = PIPE_FUNC_ALWAYS;
+	sampler.normalized_coords = 1;
+	/*sampler.prefilter = ;*/
+	/*sampler.shadow_ambient = ;*/
+	/*sampler.lod_bias = ;*/
+	/*sampler.min_lod = ;*/
+	/*sampler.max_lod = ;*/
+	/*sampler.border_color[i] = ;*/
+	/*sampler.max_anisotropy = ;*/
+	csc->sampler = pipe->create_sampler_state(pipe, &sampler);
+
+	vlCreateVertexShader(csc);	/* Return values ignored: these helpers always return 0 */
+	vlCreateFragmentShader(csc);
+	vlCreateDataBufs(csc);
+
+	return 0;
+}
+
+int vlCreateBasicCSC	/* Allocates a vlBasicCSC on pipe and stores its base in *csc; returns 0 on success, 1 if allocation fails */
+(
+	struct pipe_context *pipe,
+	struct vlCSC **csc
+)
+{
+	struct vlBasicCSC *basic_csc;
+
+	assert(pipe);
+	assert(csc);
+
+	basic_csc = calloc(1, sizeof(struct vlBasicCSC));	/* Zeroed, so framebuffer width/height start at 0 */
+
+	if (!basic_csc)
+		return 1;	/* *csc is left untouched on failure */
+
+	basic_csc->base.vlResizeFrameBuffer = &vlResizeFrameBuffer;
+	basic_csc->base.vlBegin = &vlBegin;
+	basic_csc->base.vlPutPicture = &vlPutPictureCSC;
+	basic_csc->base.vlEnd = &vlEnd;
+	basic_csc->base.vlGetFrameBuffer = &vlGetFrameBuffer;
+	basic_csc->base.vlDestroy = &vlDestroy;
+	basic_csc->pipe = pipe;
+
+	vlInit(basic_csc);	/* Return value ignored; vlInit() always returns 0 */
+
+	*csc = &basic_csc->base;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.h b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
new file mode 100644
index 0000000000..2e17f1d814
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
@@ -0,0 +1,13 @@
+#ifndef vl_basic_csc_h
+#define vl_basic_csc_h
+
+struct pipe_context;
+struct vlCSC;
+
+int vlCreateBasicCSC	/* Returns 0 and stores the new converter in *csc, or 1 if allocation fails */
+(
+	struct pipe_context *pipe,
+	struct vlCSC **csc
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.c b/src/gallium/state_trackers/g3dvl/vl_context.c
index 5616de0ba4..56d360c05b 100644
--- a/src/gallium/state_trackers/g3dvl/vl_context.c
+++ b/src/gallium/state_trackers/g3dvl/vl_context.c
@@ -1,2088 +1,26 @@
+#define VL_INTERNAL
 #include "vl_context.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <pipe/p_context.h>
-#include <pipe/p_winsys.h>
-#include <pipe/p_screen.h>
 #include <pipe/p_state.h>
-#include <pipe/p_inlines.h>
-#include <pipe/p_shader_tokens.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-#include "vl_shader_build.h"
-#include "vl_data.h"
-#include "vl_defs.h"
-#include "vl_util.h"
+#include "vl_render.h"
+#include "vl_r16snorm_mc.h"
+#include "vl_csc.h"
+#include "vl_basic_csc.h"
 
-static int vlCreateVertexShaderFrameIDCT(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-	
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * mov o0, i0		; Move pos in to pos out
-	 * mov o1, i1		; Move texcoord in to texcoord out */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	//context->states.idct.frame_vs = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderFrameIDCT(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/* decl i0		; Texcoords for s0 */
-	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0		; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl s0		; Sampler for tex containing picture to display */
-	decl = vl_decl_samplers(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* tex2d t0, i0, s0	; Read src pixel */
-	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* sub t0, t0, c0	; Subtract bias vector from pixel */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * dp4 o0.x, t0, c1	; Multiply pixel by the color conversion matrix
-	 * dp4 o0.y, t0, c2
-	 * dp4 o0.z, t0, c3
-	 * dp4 o0.w, t0, c4	; XXX: Don't need 4th coefficient
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	//context->states.idct.frame_fs = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlInitIDCT(struct VL_CONTEXT *context)
-{
-	struct pipe_context		*pipe;
-	struct pipe_sampler_state	sampler;
-	struct pipe_texture		template;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	
-	context->states.idct.viewport.scale[0] = VL_BLOCK_WIDTH;
-	context->states.idct.viewport.scale[1] = VL_BLOCK_HEIGHT;
-	context->states.idct.viewport.scale[2] = 1;
-	context->states.idct.viewport.scale[3] = 1;
-	context->states.idct.viewport.translate[0] = 0;
-	context->states.idct.viewport.translate[1] = 0;
-	context->states.idct.viewport.translate[2] = 0;
-	context->states.idct.viewport.translate[3] = 0;
-	
-	context->states.idct.render_target.width = VL_BLOCK_WIDTH;
-	context->states.idct.render_target.height = VL_BLOCK_HEIGHT;
-	context->states.idct.render_target.num_cbufs = 1;
-	context->states.idct.render_target.zsbuf = NULL;
-	
-	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
-	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-	sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-	sampler.compare_func = PIPE_FUNC_ALWAYS;
-	sampler.normalized_coords = 1;
-	/*sampler.prefilter = ;*/
-	/*sampler.shadow_ambient = ;*/
-	/*sampler.lod_bias = ;*/
-	sampler.min_lod = 0;
-	/*sampler.max_lod = ;*/
-	/*sampler.border_color[i] = ;*/
-	/*sampler.max_anisotropy = ;*/
-	context->states.idct.sampler = pipe->create_sampler_state(pipe, &sampler);
-	
-	memset(&template, 0, sizeof(struct pipe_texture));
-	template.target = PIPE_TEXTURE_2D;
-	template.format = PIPE_FORMAT_A8L8_UNORM;
-	template.last_level = 0;
-	template.width[0] = 8;
-	template.height[0] = 8;
-	template.depth[0] = 1;
-	template.compressed = 0;
-	pf_get_block(template.format, &template.block);
-	
-	context->states.idct.texture = pipe->screen->texture_create(pipe->screen, &template);
-	
-	template.format = PIPE_FORMAT_A8R8G8B8_UNORM;
-	template.width[0] = 16;
-	template.height[0] = 1;
-	
-	context->states.idct.basis = pipe->screen->texture_create(pipe->screen, &template);
-	
-	for (i = 0; i < 2; ++i)
-	{
-		context->states.idct.vertex_bufs[i] = &context->states.csc.vertex_bufs[i];
-		context->states.idct.vertex_buf_elems[i] = &context->states.csc.vertex_buf_elems[i];
-		/*
-		context->states.idct.vertex_bufs[i].pitch = sizeof(struct VL_VERTEX2F);
-		context->states.idct.vertex_bufs[i].max_index = 3;
-		context->states.idct.vertex_bufs[i].buffer_offset = 0;
-		context->states.idct.vertex_bufs[i].buffer = pipe->winsys->buffer_create
-		(
-			pipe->winsys,
-			1,
-			PIPE_BUFFER_USAGE_VERTEX,
-			sizeof(struct VL_VERTEX2F) * 4
-		);
-	
-		context->states.idct.vertex_buf_elems[i].src_offset = 0;
-		context->states.idct.vertex_buf_elems[i].vertex_buffer_index = i;
-		context->states.idct.vertex_buf_elems[i].nr_components = 2;
-		context->states.idct.vertex_buf_elems[i].src_format = PIPE_FORMAT_R32G32_FLOAT;
-		*/
-	}
-	
-	vlCreateVertexShaderFrameIDCT(context);
-	vlCreateFragmentShaderFrameIDCT(context);
-	
-	return 0;
-}
-
-static int vlDestroyIDCT(struct VL_CONTEXT *context)
-{
-	//unsigned int i;
-	
-	assert(context);
-	
-	context->pipe->delete_sampler_state(context->pipe, context->states.idct.sampler);
-	
-	//for (i = 0; i < 2; ++i)
-		//context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.idct.vertex_bufs[i].buffer);
-	
-	pipe_texture_release(&context->states.idct.texture);
-	pipe_texture_release(&context->states.idct.basis);
-	
-	//context->pipe->delete_vs_state(context->pipe, context->states.idct.frame_vs);
-	//context->pipe->delete_fs_state(context->pipe, context->states.idct.frame_fs);
-	
-	//context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.idct.vs_const_buf.buffer);
-	//context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.idct.fs_const_buf.buffer);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderIMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 50;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale unit rect to macroblock size
-	 * decl c1		; Translation vector to move macroblock into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, c1	; Translate rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma texcoords to output
-	 */
-	for (i = 1; i < 3; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.mc.i_vs = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderIMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Texcoords for s0
-	 * decl i1			; Texcoords for s1, s2
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti);
-	}
-	
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-	}
-	
-	/* mul o0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.mc.i_fs = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderFramePMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale unit rect to macroblock size
-	 * decl c1		; Translation vector to move macroblock into position
-	 * decl c2		; Unused
-	 * decl c3		; Translation vector to move ref macroblock texcoords into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma texcoords
-	 * decl o3		; Ref macroblock texcoords
-	 */
-	for (i = 0; i < 4; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, c1	; Translate rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma texcoords to output
-	 */
-	for (i = 1; i < 3; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* add o3, t0, c3	; Translate rect into position on ref macroblock */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.mc.p_vs[0] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderFieldPMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration
-		(
-			&decl,
-			&tokens[ti],
-			header,
-			max_tokens - ti
-		);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale unit rect to macroblock size
-	 * decl c1		; Translation vector to move macroblock into position
-	 * decl c2		; Denorm coefficients
-	 * decl c3		; Translation vector to move top field ref macroblock texcoords into position
-	 * decl c4		; Translation vector to move bottom field ref macroblock texcoords into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma texcoords
-	 * decl o3		; Top field ref macroblock texcoords
-	 * decl o4		; Bottom field ref macroblock texcoords
-	 * decl o5		; Denormalized vertex pos
-	 */
-	for (i = 0; i < 6; i++)
-	{
-		decl = vl_decl_output((i == 0 || i == 5) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add t1, t0, c1	; Translate rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mov o0, t1		; Move vertex pos to output */
-	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	mov o1, i1		; Move input luma texcoords to output
-	mov o2, i2		; Move input chroma texcoords to output
-	*/
-	for (i = 1; i < 3; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* add o3, t0, c3	; Translate top field rect into position on ref macroblock
-	   add o4, t0, c4	; Translate bottom field rect into position on ref macroblock */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* mul o5, t1, c2	; Denorm vertex pos */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.mc.p_vs[1] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderFramePMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Texcoords for s0
-	 * decl i1			; Texcoords for s1, s2
-	 * decl i2			; Texcoords for s3
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for ref surface texture
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-	}
-	
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* tex2d t1, i2, s3		; Read texel from ref macroblock */
-	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 2, TGSI_FILE_SAMPLER, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* add o0, t0, t1		; Add ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.mc.p_fs[0] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderFieldPMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 200;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Texcoords for s0
-	 * decl i1			; Texcoords for s1, s2
-	 * decl i2			; Texcoords for s3
-	 * decl i3			; Texcoords for s3
-	 * decl i4			; Denormalized vertex pos
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0-t4 */
-	decl = vl_decl_temps(0, 4);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for ref surface texture
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-	}
-	
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * tex2d t1, i2, s3		; Read texel from ref macroblock top field
-	 * tex2d t2, i3, s3		; Read texel from ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* XXX: Pos values off by 0.5? */
-	/* sub t4, i4.y, c1.x		; Sub 0.5 from denormalized pos */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 4, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t3, t3, c1.y		; Multiply by 2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* add o0, t0, t1		; Add ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.mc.p_fs[1] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderFrameBMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale unit rect to macroblock size
-	 * decl c1		; Translation vector to move macroblock into position
-	 * decl c2		; Unused
-	 * decl c3		; Translation vector to move past ref macroblock texcoords into position
-	 * decl c4		; Unused
-	 * decl c5		; Translation vector to move future ref macroblock texcoords into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 5);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma texcoords
-	 * decl o3		; Past ref macroblock texcoords
-	 * decl o4		; Future ref macroblock texcoords
-	 */
-	for (i = 0; i < 5; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, c1	; Translate rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma texcoords to output
-	 */
-	for (i = 1; i < 3; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* add o3, t0, c3	; Translate rect into position on past ref macroblock
-	   add o4, t0, c5	; Translate rect into position on future ref macroblock */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i * 2 + 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.mc.b_vs[0] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderFieldBMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;	
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma texcoords
-	 */
-	for (i = 0; i < 3; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale unit rect to macroblock size
-	 * decl c1		; Translation vector to move macroblock into position
-	 * decl c2		; Denorm coefficients
-	 * decl c3		; Translation vector to move top field past ref macroblock texcoords into position
-	 * decl c4		; Translation vector to move bottom field past ref macroblock texcoords into position
-	 * decl c5		; Translation vector to move top field future ref macroblock texcoords into position
-	 * decl c6		; Translation vector to move bottom field future ref macroblock texcoords into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma texcoords
-	 * decl o3		; Top field past ref macroblock texcoords
-	 * decl o4		; Bottom field past ref macroblock texcoords
-	 * decl o5		; Top field future ref macroblock texcoords
-	 * decl o6		; Bottom field future ref macroblock texcoords
-	 * decl o7		; Denormalized vertex pos
-	 */
-	for (i = 0; i < 8; i++)
-	{
-		decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add t1, t0, c1	; Translate rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mov o0, t1		; Move vertex pos to output */
-	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma texcoords to output
-	 */
-	for (i = 1; i < 3; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * add o3, t0, c3	; Translate top field rect into position on past ref macroblock
-	 * add o4, t0, c4	; Translate bottom field rect into position on past ref macroblock
-	 * add o5, t0, c5	; Translate top field rect into position on future ref macroblock
-	 * add o6, t0, c6	; Translate bottom field rect into position on future ref macroblock
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* mul o7, t1, c2	; Denorm vertex pos */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 7, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.mc.b_vs[1] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderFrameBMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Texcoords for s0
-	 * decl i1			; Texcoords for s1, s2
-	 * decl i2			; Texcoords for s3
-	 * decl i3			; Texcoords for s4
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0-t2 */
-	decl = vl_decl_temps(0, 2);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for past ref surface texture
-	 * decl s4			; Sampler for future ref surface texture
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-	}
-	
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * tex2d t1, i2, s3		; Read texel from past ref macroblock
-	 * tex2d t2, i3, s4		; Read texel from future ref macroblock
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, i + 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.mc.b_fs[0] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderFieldBMC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 200;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Texcoords for s0
-	 * decl i1			; Texcoords for s1, s2
-	 * decl i2			; Texcoords for s3
-	 * decl i3			; Texcoords for s3
-	 * decl i4			; Texcoords for s4
-	 * decl i5			; Texcoords for s4
-	 * decl i6			; Denormalized vertex pos
-	 */
-	for (i = 0; i < 7; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
-	 *				; and for Y-mod-2 top/bottom field selection
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0-t5 */
-	decl = vl_decl_temps(0, 5);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for past ref surface texture
-	 * decl s4			; Sampler for future ref surface texture
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-		
-	}
-	
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* XXX: Pos values off by 0.5? */
-	/* sub t4, i6.y, c1.x		; Sub 0.5 from denormalized pos */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 6, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t3, t3, c1.y		; Multiply by 2 */
-	inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * tex2d t1, i2, s3		; Read texel from past ref macroblock top field
-	 * tex2d t2, i3, s3		; Read texel from past ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * tex2d t4, i4, s4		; Read texel from future ref macroblock top field
-	 * tex2d t5, i5, s4		; Read texel from future ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 4, TGSI_FILE_SAMPLER, 4);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.mc.b_fs[1] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-	
-	return 0;
-}
-
-int vlCreateDataBufsMC(struct VL_CONTEXT *context)
-{
-	struct pipe_context	*pipe;
-	unsigned int		i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	
-	/* Create our vertex buffer and vertex buffer element */
-	context->states.mc.vertex_bufs[0].pitch = sizeof(struct VL_VERTEX2F);
-	context->states.mc.vertex_bufs[0].max_index = 23;
-	context->states.mc.vertex_bufs[0].buffer_offset = 0;
-	context->states.mc.vertex_bufs[0].buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_VERTEX,
-		sizeof(struct VL_VERTEX2F) * 24
-	);
-	
-	context->states.mc.vertex_buf_elems[0].src_offset = 0;
-	context->states.mc.vertex_buf_elems[0].vertex_buffer_index = 0;
-	context->states.mc.vertex_buf_elems[0].nr_components = 2;
-	context->states.mc.vertex_buf_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-	
-	/* Create our texcoord buffers and texcoord buffer elements */
-	for (i = 1; i < 3; ++i)
-	{
-		context->states.mc.vertex_bufs[i].pitch = sizeof(struct VL_TEXCOORD2F);
-		context->states.mc.vertex_bufs[i].max_index = 23;
-		context->states.mc.vertex_bufs[i].buffer_offset = 0;
-		context->states.mc.vertex_bufs[i].buffer = pipe->winsys->buffer_create
-		(
-			pipe->winsys,
-			1,
-			PIPE_BUFFER_USAGE_VERTEX,
-			sizeof(struct VL_TEXCOORD2F) * 24
-		);
-	
-		context->states.mc.vertex_buf_elems[i].src_offset = 0;
-		context->states.mc.vertex_buf_elems[i].vertex_buffer_index = i;
-		context->states.mc.vertex_buf_elems[i].nr_components = 2;
-		context->states.mc.vertex_buf_elems[i].src_format = PIPE_FORMAT_R32G32_FLOAT;
-	}
-	
-	/* Fill buffers */
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.mc.vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		vl_chroma_420_texcoords,
-		sizeof(struct VL_VERTEX2F) * 24
-	);
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.mc.vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		vl_luma_texcoords,
-		sizeof(struct VL_TEXCOORD2F) * 24
-	);
-	/* TODO: Accomodate 422, 444 */
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.mc.vertex_bufs[2].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		vl_chroma_420_texcoords,
-		sizeof(struct VL_TEXCOORD2F) * 24
-	);
-	
-	for (i = 0; i < 3; ++i)
-		pipe->winsys->buffer_unmap(pipe->winsys, context->states.mc.vertex_bufs[i].buffer);
-	
-	/* Create our constant buffer */
-	context->states.mc.vs_const_buf.size = sizeof(struct VL_MC_VS_CONSTS);
-	context->states.mc.vs_const_buf.buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		context->states.mc.vs_const_buf.size
-	);
-	
-	context->states.mc.fs_const_buf.size = sizeof(struct VL_MC_FS_CONSTS);
-	context->states.mc.fs_const_buf.buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		context->states.mc.fs_const_buf.size
-	);
-	
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.mc.fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		&vl_mc_fs_consts,
-		sizeof(struct VL_MC_FS_CONSTS)
-	);
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, context->states.mc.fs_const_buf.buffer);
-	
-	return 0;
-}
-
-static int vlInitMC(struct VL_CONTEXT *context)
-{	
-	struct pipe_context		*pipe;
-	struct pipe_sampler_state	sampler;
-	struct pipe_texture		template;
-	unsigned int			filters[5];
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	
-	/* For MC we render to textures, which are rounded up to nearest POT */
-	context->states.mc.viewport.scale[0] = vlRoundUpPOT(context->video_width);
-	context->states.mc.viewport.scale[1] = vlRoundUpPOT(context->video_height);
-	context->states.mc.viewport.scale[2] = 1;
-	context->states.mc.viewport.scale[3] = 1;
-	context->states.mc.viewport.translate[0] = 0;
-	context->states.mc.viewport.translate[1] = 0;
-	context->states.mc.viewport.translate[2] = 0;
-	context->states.mc.viewport.translate[3] = 0;
-	
-	context->states.mc.render_target.width = vlRoundUpPOT(context->video_width);
-	context->states.mc.render_target.height = vlRoundUpPOT(context->video_height);
-	context->states.mc.render_target.num_cbufs = 1;
-	/* FB for MC stage is a VL_SURFACE, set in vlSetRenderSurface() */
-	context->states.mc.render_target.zsbuf = NULL;
-	
-	filters[0] = PIPE_TEX_FILTER_NEAREST;
-	filters[1] = context->video_format == VL_FORMAT_YCBCR_444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
-	filters[2] = context->video_format == VL_FORMAT_YCBCR_444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
-	filters[3] = PIPE_TEX_FILTER_LINEAR;
-	filters[4] = PIPE_TEX_FILTER_LINEAR;
-	
-	for (i = 0; i < 5; ++i)
-	{
-		sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.min_img_filter = filters[i];
-		sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-		sampler.mag_img_filter = filters[i];
-		sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-		sampler.compare_func = PIPE_FUNC_ALWAYS;
-		sampler.normalized_coords = 1;
-		/*sampler.prefilter = ;*/
-		/*sampler.shadow_ambient = ;*/
-		/*sampler.lod_bias = ;*/
-		sampler.min_lod = 0;
-		/*sampler.max_lod = ;*/
-		/*sampler.border_color[i] = ;*/
-		/*sampler.max_anisotropy = ;*/
-		context->states.mc.samplers[i] = pipe->create_sampler_state(pipe, &sampler);
-	}
-	
-	memset(&template, 0, sizeof(struct pipe_texture));
-	template.target = PIPE_TEXTURE_2D;
-	template.format = PIPE_FORMAT_R16_SNORM;
-	template.last_level = 0;
-	template.width[0] = 8;
-	template.height[0] = 8 * 4;
-	template.depth[0] = 1;
-	template.compressed = 0;
-	pf_get_block(template.format, &template.block);
-	
-	context->states.mc.textures[0] = pipe->screen->texture_create(pipe->screen, &template);
-	
-	if (context->video_format == VL_FORMAT_YCBCR_420)
-		template.height[0] = 8;
-	else if (context->video_format == VL_FORMAT_YCBCR_422)
-		template.height[0] = 8 * 2;
-	else if (context->video_format == VL_FORMAT_YCBCR_444)
-		template.height[0] = 8 * 4;
-	else
-		assert(0);
-		
-	context->states.mc.textures[1] = pipe->screen->texture_create(pipe->screen, &template);
-	context->states.mc.textures[2] = pipe->screen->texture_create(pipe->screen, &template);
-	
-	/* textures[3] & textures[4] are assigned from VL_SURFACEs for P and B macroblocks at render time */
-	
-	vlCreateVertexShaderIMC(context);
-	vlCreateFragmentShaderIMC(context);
-	vlCreateVertexShaderFramePMC(context);
-	vlCreateVertexShaderFieldPMC(context);
-	vlCreateFragmentShaderFramePMC(context);
-	vlCreateFragmentShaderFieldPMC(context);
-	vlCreateVertexShaderFrameBMC(context);
-	vlCreateVertexShaderFieldBMC(context);
-	vlCreateFragmentShaderFrameBMC(context);
-	vlCreateFragmentShaderFieldBMC(context);
-	vlCreateDataBufsMC(context);
-	
-	return 0;
-}
-
-static int vlDestroyMC(struct VL_CONTEXT *context)
-{
-	unsigned int i;
-	
-	assert(context);
-	
-	for (i = 0; i < 5; ++i)
-		context->pipe->delete_sampler_state(context->pipe, context->states.mc.samplers[i]);
-	
-	for (i = 0; i < 3; ++i)
-		context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.mc.vertex_bufs[i].buffer);
-	
-	/* Textures 3 & 4 are not created directly, no need to release them here */
-	for (i = 0; i < 3; ++i)
-		pipe_texture_release(&context->states.mc.textures[i]);
-	
-	context->pipe->delete_vs_state(context->pipe, context->states.mc.i_vs);
-	context->pipe->delete_fs_state(context->pipe, context->states.mc.i_fs);
-	
-	for (i = 0; i < 2; ++i)
-	{
-		context->pipe->delete_vs_state(context->pipe, context->states.mc.p_vs[i]);
-		context->pipe->delete_fs_state(context->pipe, context->states.mc.p_fs[i]);
-		context->pipe->delete_vs_state(context->pipe, context->states.mc.b_vs[i]);
-		context->pipe->delete_fs_state(context->pipe, context->states.mc.b_fs[i]);
-	}
-	
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.mc.vs_const_buf.buffer);
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.mc.fs_const_buf.buffer);
-	
-	return 0;
-}
-
-static int vlCreateVertexShaderCSC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 50;
-	
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-	
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/*
-	 * decl c0		; Scaling vector to scale texcoord rect to source size
-	 * decl c1		; Translation vector to move texcoord rect into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-	
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* mov o0, i0		; Move pos in to pos out */
-	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_INPUT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* mul t0, i1, c0	; Scale unit texcoord rect to source size */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 1, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o1, t0, c1	; Translate texcoord rect into position */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	vs.tokens = tokens;
-	context->states.csc.vertex_shader = pipe->create_vs_state(pipe, &vs);
-	//free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateFragmentShaderCSC(struct VL_CONTEXT *context)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-	
-	unsigned int			ti;
-	unsigned int			i;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/* decl i0		; Texcoords for s0 */
-	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * decl c0		; Bias vector for CSC
-	 * decl c1-c4		; CSC matrix c1-c4
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0		; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* decl s0		; Sampler for tex containing picture to display */
-	decl = vl_decl_samplers(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	
-	/* tex2d t0, i0, s0	; Read src pixel */
-	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/* sub t0, t0, c0	; Subtract bias vector from pixel */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	
-	/*
-	 * dp4 o0.x, t0, c1	; Multiply pixel by the color conversion matrix
-	 * dp4 o0.y, t0, c2
-	 * dp4 o0.z, t0, c3
-	 * dp4 o0.w, t0, c4	; XXX: Don't need 4th coefficient
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	context->states.csc.fragment_shader = pipe->create_fs_state(pipe, &fs);
-	//free(tokens);
-	
-	return 0;
-}
-
-static int vlCreateDataBufsCSC(struct VL_CONTEXT *context)
-{
-	struct pipe_context *pipe;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	
-	/*
-	Create our vertex buffer and vertex buffer element
-	VB contains 4 vertices that render a quad covering the entire window
-	to display a rendered surface
-	Quad is rendered as a tri strip
-	*/
-	context->states.csc.vertex_bufs[0].pitch = sizeof(struct VL_VERTEX2F);
-	context->states.csc.vertex_bufs[0].max_index = 3;
-	context->states.csc.vertex_bufs[0].buffer_offset = 0;
-	context->states.csc.vertex_bufs[0].buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_VERTEX,
-		sizeof(struct VL_VERTEX2F) * 4
-	);
-	
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.csc.vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		vl_surface_vertex_positions,
-		sizeof(struct VL_VERTEX2F) * 4
-	);
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, context->states.csc.vertex_bufs[0].buffer);
-	
-	context->states.csc.vertex_buf_elems[0].src_offset = 0;
-	context->states.csc.vertex_buf_elems[0].vertex_buffer_index = 0;
-	context->states.csc.vertex_buf_elems[0].nr_components = 2;
-	context->states.csc.vertex_buf_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-	
-	/*
-	Create our texcoord buffer and texcoord buffer element
-	Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
-	*/
-	context->states.csc.vertex_bufs[1].pitch = sizeof(struct VL_TEXCOORD2F);
-	context->states.csc.vertex_bufs[1].max_index = 3;
-	context->states.csc.vertex_bufs[1].buffer_offset = 0;
-	context->states.csc.vertex_bufs[1].buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_VERTEX,
-		sizeof(struct VL_TEXCOORD2F) * 4
-	);
-	
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.csc.vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		vl_surface_texcoords,
-		sizeof(struct VL_TEXCOORD2F) * 4
-	);
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, context->states.csc.vertex_bufs[1].buffer);
-	
-	context->states.csc.vertex_buf_elems[1].src_offset = 0;
-	context->states.csc.vertex_buf_elems[1].vertex_buffer_index = 1;
-	context->states.csc.vertex_buf_elems[1].nr_components = 2;
-	context->states.csc.vertex_buf_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
-	
-	/*
-	Create our vertex shader's constant buffer
-	Const buffer contains scaling and translation vectors
-	*/
-	context->states.csc.vs_const_buf.size = sizeof(struct VL_CSC_VS_CONSTS);
-	context->states.csc.vs_const_buf.buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		context->states.csc.vs_const_buf.size
-	);
-	
-	/*
-	Create our fragment shader's constant buffer
-	Const buffer contains the color conversion matrix and bias vectors
-	*/
-	context->states.csc.fs_const_buf.size = sizeof(struct VL_CSC_FS_CONSTS);
-	context->states.csc.fs_const_buf.buffer = pipe->winsys->buffer_create
-	(
-		pipe->winsys,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		context->states.csc.fs_const_buf.size
-	);
-	
-	/*
-	TODO: Refactor this into a seperate function,
-	allow changing the CSC matrix at runtime to switch between regular & full versions
-	*/
-	memcpy
-	(
-		pipe->winsys->buffer_map(pipe->winsys, context->states.csc.fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		&vl_csc_fs_consts_601,
-		sizeof(struct VL_CSC_FS_CONSTS)
-	);
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, context->states.csc.fs_const_buf.buffer);
-	
-	return 0;
-}
-
-static int vlInitCSC(struct VL_CONTEXT *context)
-{	
-	struct pipe_context		*pipe;
-	struct pipe_sampler_state	sampler;
-	
-	assert(context);
-	
-	pipe = context->pipe;
-	
-	/* Delay creating the FB until vlPutSurface() so we know window size */
-	context->states.csc.framebuffer.num_cbufs = 1;
-	context->states.csc.framebuffer.cbufs[0] = NULL;
-	context->states.csc.framebuffer.zsbuf = NULL;
-
-	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
-	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-	sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
-	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-	sampler.compare_func = PIPE_FUNC_ALWAYS;
-	sampler.normalized_coords = 1;
-	/*sampler.prefilter = ;*/
-	/*sampler.shadow_ambient = ;*/
-	/*sampler.lod_bias = ;*/
-	/*sampler.min_lod = ;*/
-	/*sampler.max_lod = ;*/
-	/*sampler.border_color[i] = ;*/
-	/*sampler.max_anisotropy = ;*/
-	context->states.csc.sampler = pipe->create_sampler_state(pipe, &sampler);
-	
-	vlCreateVertexShaderCSC(context);
-	vlCreateFragmentShaderCSC(context);
-	vlCreateDataBufsCSC(context);
-	
-	return 0;
-}
-
-static int vlDestroyCSC(struct VL_CONTEXT *context)
-{
-	assert(context);
-	
-	/*
-	Since we create the final FB when we display our first surface,
-	it may not be created if vlPutSurface() is never called
-	*/
-	if (context->states.csc.framebuffer.cbufs[0])
-		context->pipe->winsys->surface_release(context->pipe->winsys, &context->states.csc.framebuffer.cbufs[0]);
-	context->pipe->delete_sampler_state(context->pipe, context->states.csc.sampler);
-	context->pipe->delete_vs_state(context->pipe, context->states.csc.vertex_shader);
-	context->pipe->delete_fs_state(context->pipe, context->states.csc.fragment_shader);
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.csc.vertex_bufs[0].buffer);
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.csc.vertex_bufs[1].buffer);
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.csc.vs_const_buf.buffer);
-	context->pipe->winsys->buffer_destroy(context->pipe->winsys, context->states.csc.fs_const_buf.buffer);
-	
-	return 0;
-}
-
-static int vlInitCommon(struct VL_CONTEXT *context)
+static int vlInitCommon(struct vlContext *context)
 {
 	struct pipe_context			*pipe;
 	struct pipe_rasterizer_state		rast;
 	struct pipe_blend_state			blend;
 	struct pipe_depth_stencil_alpha_state	dsa;
 	unsigned int				i;
-	
+
 	assert(context);
-	
+
 	pipe = context->pipe;
-	
+
 	rast.flatshade = 1;
 	rast.flatshade_first = 0;
 	rast.light_twoside = 0;
@@ -2113,9 +51,9 @@ static int vlInitCommon(struct VL_CONTEXT *context)
 	rast.offset_units = 1;
 	rast.offset_scale = 1;
 	/*rast.sprite_coord_mode[i] = ;*/
-	context->states.common.raster = pipe->create_rasterizer_state(pipe, &rast);
-	pipe->bind_rasterizer_state(pipe, context->states.common.raster);
-	
+	context->raster = pipe->create_rasterizer_state(pipe, &rast);
+	pipe->bind_rasterizer_state(pipe, context->raster);
+
 	blend.blend_enable = 0;
 	blend.rgb_func = PIPE_BLEND_ADD;
 	blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
@@ -2128,9 +66,9 @@ static int vlInitCommon(struct VL_CONTEXT *context)
 	/* Needed to allow color writes to FB, even if blending disabled */
 	blend.colormask = PIPE_MASK_RGBA;
 	blend.dither = 0;
-	context->states.common.blend = pipe->create_blend_state(pipe, &blend);
-	pipe->bind_blend_state(pipe, context->states.common.blend);
-	
+	context->blend = pipe->create_blend_state(pipe, &blend);
+	pipe->bind_blend_state(pipe, context->blend);
+
 	dsa.depth.enabled = 0;
 	dsa.depth.writemask = 0;
 	dsa.depth.func = PIPE_FUNC_ALWAYS;
@@ -2149,134 +87,122 @@ static int vlInitCommon(struct VL_CONTEXT *context)
 	dsa.alpha.enabled = 0;
 	dsa.alpha.func = PIPE_FUNC_ALWAYS;
 	dsa.alpha.ref = 0;
-	context->states.common.dsa = pipe->create_depth_stencil_alpha_state(pipe, &dsa);
-	pipe->bind_depth_stencil_alpha_state(pipe, context->states.common.dsa);
-	
-	return 0;
-}
+	context->dsa = pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+	pipe->bind_depth_stencil_alpha_state(pipe, context->dsa);
 
-static int vlDestroyCommon(struct VL_CONTEXT *context)
-{
-	assert(context);
-	
-	context->pipe->delete_blend_state(context->pipe, context->states.common.blend);
-	context->pipe->delete_rasterizer_state(context->pipe, context->states.common.raster);
-	context->pipe->delete_depth_stencil_alpha_state(context->pipe, context->states.common.dsa);
-	
 	return 0;
 }
 
-static int vlInit(struct VL_CONTEXT *context)
+int vlCreateContext	/* Allocates a vlContext, records the arguments in it and creates its MC and CSC stages; returns 0 on success, 1 if allocation fails */
+(
+	struct vlScreen *screen,
+	struct pipe_context *pipe,
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	enum vlProfile profile,
+	enum vlEntryPoint entry_point,
+	struct vlContext **context	/* out: set to the new context only on success */
+)
 {
+	struct vlContext *ctx;
+
+	assert(screen);
 	assert(context);
-	
-	vlInitCommon(context);
-	vlInitCSC(context);
-	vlInitMC(context);
-	vlInitIDCT(context);
-	
+	assert(pipe);
+
+	ctx = calloc(1, sizeof(struct vlContext));	/* zero-filled, so unset members start as 0/NULL */
+
+	if (!ctx)
+		return 1;
+
+	ctx->screen = screen;
+	ctx->pipe = pipe;
+	ctx->picture_width = picture_width;
+	ctx->picture_height = picture_height;
+	ctx->picture_format = picture_format;
+	ctx->profile = profile;
+	ctx->entry_point = entry_point;
+
+	vlInitCommon(ctx);	/* creates and binds the rasterizer, blend and depth/stencil/alpha states */
+
+	vlCreateR16SNormMC(pipe, picture_width, picture_height, picture_format, &ctx->render);	/* NOTE(review): return value ignored - confirm this cannot fail */
+	vlCreateBasicCSC(pipe, &ctx->csc);	/* NOTE(review): return value ignored - confirm this cannot fail */
+
+	*context = ctx;
+
 	return 0;
 }
 
-static int vlDestroy(struct VL_CONTEXT *context)
+int vlDestroyContext	/* Destroys both stages, deletes the three state objects and frees the context; always returns 0 */
+(
+	struct vlContext *context
+)
 {
 	assert(context);
-	
+
 	/* XXX: Must unbind shaders before we can delete them for some reason */
 	context->pipe->bind_vs_state(context->pipe, NULL);
 	context->pipe->bind_fs_state(context->pipe, NULL);
-	
-	vlDestroyCommon(context);
-	vlDestroyCSC(context);
-	vlDestroyMC(context);
-	vlDestroyIDCT(context);
-	
+
+	context->render->vlDestroy(context->render);	/* stage created by vlCreateR16SNormMC() in vlCreateContext() */
+	context->csc->vlDestroy(context->csc);	/* stage created by vlCreateBasicCSC() in vlCreateContext() */
+
+	context->pipe->delete_blend_state(context->pipe, context->blend);	/* the three objects below were created in vlInitCommon() */
+	context->pipe->delete_rasterizer_state(context->pipe, context->raster);
+	context->pipe->delete_depth_stencil_alpha_state(context->pipe, context->dsa);
+
+	free(context);	/* context must not be used after this call */
+
 	return 0;
 }
 
-int vlCreateContext
+struct vlScreen* vlContextGetScreen	/* Accessor: returns the screen pointer stored by vlCreateContext() */
 (
-	Display *display,
-	struct pipe_context *pipe,
-	unsigned int video_width,
-	unsigned int video_height,
-	enum VL_FORMAT video_format,
-	struct VL_CONTEXT **context
+	struct vlContext *context
 )
 {
-	struct VL_CONTEXT *ctx;
-	
-	assert(display);
-	assert(pipe);
 	assert(context);
-	
-	ctx = calloc(1, sizeof(struct VL_CONTEXT));
-	
-	ctx->display = display;
-	ctx->pipe = pipe;
-	ctx->video_width = video_width;
-	ctx->video_height = video_height;
-	ctx->video_format = video_format;
-	
-	vlInit(ctx);
-	
-	/* Since we only change states in vlPutSurface() we need to start in render mode */
-	vlBeginRender(ctx);
-	
-	*context = ctx;
-	
-	return 0;
+
+	return context->screen;
 }
 
-int vlDestroyContext(struct VL_CONTEXT *context)
+struct pipe_context* vlGetPipeContext	/* Accessor: returns the pipe context stored by vlCreateContext() */
+(
+	struct vlContext *context
+)
 {
 	assert(context);
-	
-	vlDestroy(context);
-	
-	free(context);
-	
-	return 0;
+
+	return context->pipe;
 }
 
-int vlBeginRender(struct VL_CONTEXT *context)
+unsigned int vlGetPictureWidth	/* Accessor: returns the picture_width given to vlCreateContext() */
+(
+	struct vlContext *context
+)
 {
-	struct pipe_context	*pipe;
-	
 	assert(context);
-	
-	pipe = context->pipe;
-	
-	/* Frame buffer set in vlRender*Macroblock() */
-	/* Shaders, samplers, textures set in vlRender*Macroblock() */
-	pipe->set_vertex_buffers(pipe, 3, context->states.mc.vertex_bufs);
-	pipe->set_vertex_elements(pipe, 3, context->states.mc.vertex_buf_elems);
-	pipe->set_viewport_state(pipe, &context->states.mc.viewport);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &context->states.mc.vs_const_buf);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &context->states.mc.fs_const_buf);
-	
-	return 0;
+
+	return context->picture_width;
 }
 
-int vlEndRender(struct VL_CONTEXT *context)
+unsigned int vlGetPictureHeight	/* Accessor: returns the picture_height given to vlCreateContext() */
+(
+	struct vlContext *context
+)
 {
-	struct pipe_context *pipe;
-	
 	assert(context);
-	
-	pipe = context->pipe;
-	
-	pipe->set_framebuffer_state(pipe, &context->states.csc.framebuffer);
-	pipe->set_viewport_state(pipe, &context->states.csc.viewport);
-	pipe->bind_sampler_states(pipe, 1, (void**)&context->states.csc.sampler);
-	/* Source texture set in vlPutSurface() */
-	pipe->bind_vs_state(pipe, context->states.csc.vertex_shader);
-	pipe->bind_fs_state(pipe, context->states.csc.fragment_shader);
-	pipe->set_vertex_buffers(pipe, 2, context->states.csc.vertex_bufs);
-	pipe->set_vertex_elements(pipe, 2, context->states.csc.vertex_buf_elems);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &context->states.csc.vs_const_buf);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &context->states.csc.fs_const_buf);
-	
-	return 0;
+
+	return context->picture_height;
 }
 
+enum vlFormat vlGetPictureFormat	/* Accessor: returns the picture_format given to vlCreateContext() */
+(
+	struct vlContext *context
+)
+{
+	assert(context);
+
+	return context->picture_format;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.h b/src/gallium/state_trackers/g3dvl/vl_context.h
index bff318854a..3d14634c44 100644
--- a/src/gallium/state_trackers/g3dvl/vl_context.h
+++ b/src/gallium/state_trackers/g3dvl/vl_context.h
@@ -1,83 +1,73 @@
 #ifndef vl_context_h
 #define vl_context_h
 
-#include <X11/Xlib.h>
-#include <pipe/p_state.h>
 #include "vl_types.h"
 
 struct pipe_context;
 
-struct VL_CONTEXT
+#ifdef VL_INTERNAL
+struct vlRender;
+struct vlCSC;
+
+struct vlContext	/* Per-context state; only visible to translation units that define VL_INTERNAL */
 {
-	Display			*display;
+	struct vlScreen		*screen;	/* as passed to vlCreateContext() */
 	struct pipe_context	*pipe;
-	unsigned int		video_width;
-	unsigned int		video_height;
-	enum VL_FORMAT		video_format;
-	
-	struct
-	{
-		struct
-		{
-			struct pipe_rasterizer_state		*raster;
-			struct pipe_depth_stencil_alpha_state	*dsa;
-			struct pipe_blend_state			*blend;
-		} common;
-		
-		struct
-		{
-			struct pipe_viewport_state		viewport;
-			struct pipe_framebuffer_state		render_target;
-			struct pipe_sampler_state		*sampler;
-			struct pipe_texture			*texture;
-			struct pipe_texture			*basis;
-			struct pipe_shader_state		*frame_vs;
-			struct pipe_shader_state		*frame_fs;
-			struct pipe_vertex_buffer 		*vertex_bufs[2];
-			struct pipe_vertex_element		*vertex_buf_elems[2];
-			//struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
-		} idct;
-		
-		struct
-		{
-			struct pipe_viewport_state		viewport;
-			struct pipe_framebuffer_state		render_target;
-			struct pipe_sampler_state		*samplers[5];
-			struct pipe_texture			*textures[5];
-			struct pipe_shader_state		*i_vs, *p_vs[2], *b_vs[2];
-			struct pipe_shader_state		*i_fs, *p_fs[2], *b_fs[2];
-			struct pipe_vertex_buffer 		vertex_bufs[3];
-			struct pipe_vertex_element		vertex_buf_elems[3];
-			struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
-		} mc;
-		
-		struct
-		{
-			struct pipe_viewport_state		viewport;
-			struct pipe_framebuffer_state		framebuffer;
-			struct pipe_sampler_state		*sampler;
-			struct pipe_shader_state		*vertex_shader, *fragment_shader;
-			struct pipe_vertex_buffer 		vertex_bufs[2];
-			struct pipe_vertex_element		vertex_buf_elems[2];
-			struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
-		} csc;
-	} states;
+	unsigned int		picture_width;	/* as passed to vlCreateContext() */
+	unsigned int		picture_height;	/* as passed to vlCreateContext() */
+	enum vlFormat		picture_format;	/* as passed to vlCreateContext() */
+	enum vlProfile		profile;	/* stored by vlCreateContext() */
+	enum vlEntryPoint	entry_point;	/* stored by vlCreateContext() */
+
+	void			*raster;	/* handle from pipe->create_rasterizer_state() (vlInitCommon) */
+	void			*dsa;	/* handle from pipe->create_depth_stencil_alpha_state() (vlInitCommon) */
+	void			*blend;	/* handle from pipe->create_blend_state() (vlInitCommon) */
+
+	struct vlRender		*render;	/* filled in by vlCreateR16SNormMC() in vlCreateContext() */
+	struct vlCSC		*csc;	/* filled in by vlCreateBasicCSC() in vlCreateContext() */
 };
+#endif
 
 int vlCreateContext
 (
-	Display *display,
+	struct vlScreen *screen,
 	struct pipe_context *pipe,
-	unsigned int video_width,
-	unsigned int video_height,
-	enum VL_FORMAT video_format,
-	struct VL_CONTEXT **context
+	unsigned int picture_width,
+	unsigned int picture_height,
+	enum vlFormat picture_format,
+	enum vlProfile profile,
+	enum vlEntryPoint entry_point,
+	struct vlContext **context
 );
 
-int vlDestroyContext(struct VL_CONTEXT *context);
+int vlDestroyContext
+(
+	struct vlContext *context
+);
 
-int vlBeginRender(struct VL_CONTEXT *context);
-int vlEndRender(struct VL_CONTEXT *context);
+struct vlScreen* vlContextGetScreen
+(
+	struct vlContext *context
+);
 
-#endif
+struct pipe_context* vlGetPipeContext
+(
+	struct vlContext *context
+);
 
+unsigned int vlGetPictureWidth
+(
+	struct vlContext *context
+);
+
+unsigned int vlGetPictureHeight
+(
+	struct vlContext *context
+);
+
+enum vlFormat vlGetPictureFormat
+(
+	struct vlContext *context
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_csc.h b/src/gallium/state_trackers/g3dvl/vl_csc.h
new file mode 100644
index 0000000000..36417a2792
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_csc.h
@@ -0,0 +1,53 @@
+#ifndef vl_csc_h
+#define vl_csc_h
+
+#include "vl_types.h"
+
+struct pipe_surface;
+
+struct vlCSC	/* Table of function pointers for a CSC stage; vlCreateContext() obtains one from vlCreateBasicCSC() */
+{
+	int (*vlResizeFrameBuffer)	/* NOTE(review): presumably resizes the stage's frame buffer to width x height - confirm against the implementation */
+	(
+		struct vlCSC *csc,
+		unsigned int width,
+		unsigned int height
+	);
+
+	int (*vlBegin)	/* NOTE(review): presumably paired with vlEnd around vlPutPicture calls - confirm */
+	(
+		struct vlCSC *csc
+	);
+
+	int (*vlPutPicture)
+	(
+		struct vlCSC *csc,
+		struct vlSurface *surface,
+		int srcx,	/* NOTE(review): src* presumably describe a rectangle on surface - confirm */
+		int srcy,
+		int srcw,
+		int srch,
+		int destx,	/* NOTE(review): dest* presumably describe a rectangle on the frame buffer - confirm */
+		int desty,
+		int destw,
+		int desth,
+		enum vlPictureType picture_type
+	);
+
+	int (*vlEnd)
+	(
+		struct vlCSC *csc
+	);
+
+	struct pipe_surface* (*vlGetFrameBuffer)
+	(
+		struct vlCSC *csc
+	);
+
+	int (*vlDestroy)	/* invoked by vlDestroyContext() on context->csc */
+	(
+		struct vlCSC *csc
+	);
+};
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_data.c b/src/gallium/state_trackers/g3dvl/vl_data.c
index 0e5c8c77f9..f2476dbf1e 100644
--- a/src/gallium/state_trackers/g3dvl/vl_data.c
+++ b/src/gallium/state_trackers/g3dvl/vl_data.c
@@ -6,17 +6,17 @@
  * Need to be scaled to cover mbW*mbH macroblock pixels and translated into
  * position on target surface.
  */
-const struct VL_VERTEX2F vl_mb_vertex_positions[24] =
+const struct vlVertex2f macroblock_verts[24] =
 {
 	{0.0f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.0f},
 	{0.5f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.5f},
-	
+
 	{0.5f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.0f},
 	{1.0f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.5f},
-	
+
 	{0.0f, 0.5f}, {0.0f, 1.0f}, {0.5f, 0.5f},
 	{0.5f, 0.5f}, {0.0f, 1.0f}, {0.5f, 1.0f},
-	
+
 	{0.5f, 0.5f}, {0.5f, 1.0f}, {1.0f, 0.5f},
 	{1.0f, 0.5f}, {0.5f, 1.0f}, {1.0f, 1.0f}
 };
@@ -26,17 +26,17 @@ const struct VL_VERTEX2F vl_mb_vertex_positions[24] =
  * in a bW*(bH*4) texture. First luma block located at 0,0->bW,bH; second at
  * 0,bH->bW,2bH; third at 0,2bH->bW,3bH; fourth at 0,3bH->bW,4bH.
  */
-const struct VL_TEXCOORD2F vl_luma_texcoords[24] =
+const struct vlVertex2f macroblock_luma_texcoords[24] =
 {
 	{0.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.0f},
 	{1.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.25f},
-	
+
 	{0.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.25f},
 	{1.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.5f},
-	
+
 	{0.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.5f},
 	{1.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.75f},
-	
+
 	{0.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 0.75f},
 	{1.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 1.0f}
 };
@@ -45,7 +45,7 @@ const struct VL_TEXCOORD2F vl_luma_texcoords[24] =
  * Represents texcoords for the above for rendering 1 chroma block.
  * Straight forward 0,0->1,1 mapping so we can reuse the MB pos vectors.
  */
-const struct VL_TEXCOORD2F *vl_chroma_420_texcoords = (const struct VL_TEXCOORD2F*)vl_mb_vertex_positions;
+const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts;
 
 /*
  * Represents texcoords for the above for rendering 2 chroma blocks arranged
@@ -53,30 +53,13 @@ const struct VL_TEXCOORD2F *vl_chroma_420_texcoords = (const struct VL_TEXCOORD2
  * 0,bH->bW,2bH. We can render this with 0,0->1,1 mapping.
  * Straight forward 0,0->1,1 mapping so we can reuse MB pos vectors.
  */
-const struct VL_TEXCOORD2F *vl_chroma_422_texcoords = (const struct VL_TEXCOORD2F*)vl_mb_vertex_positions;
+const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts;
 
 /*
  * Represents texcoords for the above for rendering 4 chroma blocks.
  * Same case as 4 luma blocks.
  */
-const struct VL_TEXCOORD2F *vl_chroma_444_texcoords = vl_luma_texcoords;
-
-/*
- * Represents 2 triangles in a strip in normalized coords.
- * Used to render the surface onto the frame buffer.
- */
-const struct VL_VERTEX2F vl_surface_vertex_positions[4] =
-{
-	{0.0f, 0.0f},
-	{0.0f, 1.0f},
-	{1.0f, 0.0f},
-	{1.0f, 1.0f}
-};
-
-/*
- * Represents texcoords for the above. We can use the position values directly.
- */
-const struct VL_TEXCOORD2F *vl_surface_texcoords = (const struct VL_TEXCOORD2F*)vl_surface_vertex_positions;
+const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords;
 
 /*
  * Used when rendering P and B macroblocks, multiplier is applied to the A channel,
@@ -84,97 +67,10 @@ const struct VL_TEXCOORD2F *vl_surface_texcoords = (const struct VL_TEXCOORD2F*)
  * get back the differential. The differential is then added to the samples from the
  * reference surface(s).
  */
+#if 0
 const struct VL_MC_FS_CONSTS vl_mc_fs_consts =
 {
 	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
 	{0.5f, 2.0f, 0.0f, 0.0f}
 };
-
-/*
- * Identity color conversion constants, for debugging
- */
-const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_identity =
-{
-	{
-		0.0f, 0.0f, 0.0f, 0.0f
-	},
-	{
-		1.0f, 0.0f, 0.0f, 0.0f,
-		0.0f, 1.0f, 0.0f, 0.0f,
-		0.0f, 0.0f, 1.0f, 0.0f,
-		0.0f, 0.0f, 0.0f, 1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_601 =
-{
-	{
-		0.0f,		0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.0f,		0.0f,		1.371f,		0.0f,
-		1.0f,		-0.336f,	-0.698f,	0.0f,
-		1.0f,		1.732f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_601_full =
-{
-	{
-		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.164f,		0.0f,		1.596f,		0.0f,
-		1.164f,		-0.391f,	-0.813f,	0.0f,
-		1.164f,		2.018f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_709 =
-{
-	{
-		0.0f,		0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.0f,		0.0f,		1.540f,		0.0f,
-		1.0f,		-0.183f,	-0.459f,	0.0f,
-		1.0f,		1.816f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_709_full =
-{
-	{
-		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.164f,		0.0f,		1.793f,		0.0f,
-		1.164f,		-0.213f,	-0.534f,	0.0f,
-		1.164f,		2.115f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_data.h b/src/gallium/state_trackers/g3dvl/vl_data.h
index 8f347273ad..f0de2e976c 100644
--- a/src/gallium/state_trackers/g3dvl/vl_data.h
+++ b/src/gallium/state_trackers/g3dvl/vl_data.h
@@ -3,15 +3,18 @@
 
 #include "vl_types.h"
 
-extern const struct VL_VERTEX2F vl_mb_vertex_positions[24];
-extern const struct VL_TEXCOORD2F vl_luma_texcoords[24];
-extern const struct VL_TEXCOORD2F *vl_chroma_420_texcoords;
-extern const struct VL_TEXCOORD2F *vl_chroma_422_texcoords;
-extern const struct VL_TEXCOORD2F *vl_chroma_444_texcoords;
+/* TODO: Needs to be rolled into the appropriate stage */
 
-extern const struct VL_VERTEX2F vl_surface_vertex_positions[4];
-extern const struct VL_TEXCOORD2F *vl_surface_texcoords;
+extern const struct vlVertex2f macroblock_verts[24];
+extern const struct vlVertex2f macroblock_luma_texcoords[24];
+extern const struct vlVertex2f *macroblock_chroma_420_texcoords;
+extern const struct vlVertex2f *macroblock_chroma_422_texcoords;
+extern const struct vlVertex2f *macroblock_chroma_444_texcoords;
 
+extern const struct vlVertex2f surface_verts[4];	/* NOTE(review): this patch adds no definition for this name in vl_data.c - confirm one exists elsewhere */
+extern const struct vlVertex2f *surface_texcoords;	/* NOTE(review): same as above - no definition added in vl_data.c */
+
+/*
 extern const struct VL_MC_FS_CONSTS vl_mc_fs_consts;
 
 extern const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_identity;
@@ -19,6 +22,6 @@ extern const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_601;
 extern const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_601_full;
 extern const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_709;
 extern const struct VL_CSC_FS_CONSTS vl_csc_fs_consts_709_full;
+*/
 
 #endif
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_defs.h b/src/gallium/state_trackers/g3dvl/vl_defs.h
index e668a7a10e..d612d02502 100644
--- a/src/gallium/state_trackers/g3dvl/vl_defs.h
+++ b/src/gallium/state_trackers/g3dvl/vl_defs.h
@@ -9,4 +9,3 @@
 #define VL_MACROBLOCK_SIZE	(VL_MACROBLOCK_WIDTH * VL_MACROBLOCK_HEIGHT)
 
 #endif
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.c b/src/gallium/state_trackers/g3dvl/vl_display.c
new file mode 100644
index 0000000000..af80faa7f5
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_display.c
@@ -0,0 +1,48 @@
+#define VL_INTERNAL
+#include "vl_display.h"
+#include <assert.h>
+#include <stdlib.h>
+
+int vlCreateDisplay	/* Allocates a vlDisplay wrapping native_display; returns 0 on success, 1 if allocation fails */
+(
+	vlNativeDisplay native_display,
+	struct vlDisplay **display	/* out: set to the new display only on success */
+)
+{
+	struct vlDisplay *dpy;
+
+	assert(native_display);
+	assert(display);
+
+	dpy = calloc(1, sizeof(struct vlDisplay));
+
+	if (!dpy)
+		return 1;
+
+	dpy->native = native_display;	/* stored as given; nothing here copies or takes ownership of it */
+	*display = dpy;
+
+	return 0;
+}
+
+int vlDestroyDisplay	/* Frees the vlDisplay wrapper; always returns 0 */
+(
+	struct vlDisplay *display
+)
+{
+	assert(display);
+
+	free(display);	/* only the wrapper is freed; display->native is not touched */
+
+	return 0;
+}
+
+vlNativeDisplay vlGetNativeDisplay	/* Accessor: returns the native display stored by vlCreateDisplay() */
+(
+	struct vlDisplay *display
+)
+{
+	assert(display);
+
+	return display->native;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.h b/src/gallium/state_trackers/g3dvl/vl_display.h
new file mode 100644
index 0000000000..e11fd40799
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_display.h
@@ -0,0 +1,29 @@
+#ifndef vl_display_h
+#define vl_display_h
+
+#include "vl_types.h"
+
+#ifdef VL_INTERNAL
+struct vlDisplay	/* Wrapper around a native display handle; only visible when VL_INTERNAL is defined */
+{
+	vlNativeDisplay native;	/* set once by vlCreateDisplay() */
+};
+#endif
+
+int vlCreateDisplay
+(
+	vlNativeDisplay native_display,
+	struct vlDisplay **display
+);
+
+int vlDestroyDisplay
+(
+	struct vlDisplay *display
+);
+
+vlNativeDisplay vlGetNativeDisplay
+(
+	struct vlDisplay *display
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c
new file mode 100644
index 0000000000..4fae224431
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c
@@ -0,0 +1,2315 @@
+#define VL_INTERNAL
+#include "vl_r16snorm_mc.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <pipe/p_context.h>
+#include <pipe/p_winsys.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_render.h"
+#include "vl_shader_build.h"
+#include "vl_surface.h"
+#include "vl_util.h"
+#include "vl_types.h"
+#include "vl_defs.h"
+
+struct vlVertexShaderConsts
+{
+	/* Layout written through a mapping of vs_const_buf by the vlRender*MacroBlock() functions.
+	   scale and mb_pos_trans hold macroblock size/position divided by the target texture size. */
+	struct vlVertex4f	scale;
+	struct vlVertex4f	mb_pos_trans;
+	struct vlVertex4f	denorm;	/* x/y set to the target texture width/height for field motion in vlRenderPMacroBlock() */
+	struct
+	{
+		struct vlVertex4f	top_field;
+		struct vlVertex4f	bottom_field;	/* only written for field motion in vlRenderPMacroBlock() */
+	} mb_tc_trans[2];	/* vlRenderPMacroBlock() uses index 0 only */
+};
+
+struct vlFragmentShaderConsts	/* NOTE(review): presumably the layout of fs_const_buf - confirm where it is filled in */
+{
+	struct vlVertex4f multiplier;
+	struct vlVertex4f div;
+};
+
+struct vlR16SnormMC
+{
+	struct vlRender				base;	/* first member: vlBegin() casts a struct vlRender* to this type */
+
+	unsigned int				video_width, video_height;
+	enum vlFormat				video_format;
+
+	struct pipe_context			*pipe;
+	struct pipe_viewport_state		viewport;
+	struct pipe_framebuffer_state		render_target;	/* cbufs[0] is replaced per macroblock by the vlRender*MacroBlock() functions */
+	struct pipe_sampler_state		*samplers[5];
+	struct pipe_texture			*textures[5];	/* [0..2] are filled by vlGrabBlocks(); [3] is the reference texture for P macroblocks */
+	void					*i_vs, *p_vs[2], *b_vs[2];	/* p_vs[0]: frame motion, p_vs[1]: field motion (see vlRenderPMacroBlock) */
+	void					*i_fs, *p_fs[2], *b_fs[2];	/* indexed like the vertex shaders above */
+	struct pipe_vertex_buffer 		vertex_bufs[3];
+	struct pipe_vertex_element		vertex_elems[3];
+	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
+};
+
+int vlBegin	/* Sets the stage's vertex buffers/elements, viewport and both constant buffers on its pipe; always returns 0 */
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormMC	*mc;
+	struct pipe_context	*pipe;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;	/* relies on base being the first member of struct vlR16SnormMC */
+	pipe = mc->pipe;
+
+	/* Frame buffer set in vlRender*Macroblock() */
+	/* Shaders, samplers, textures set in vlRender*Macroblock() */
+	pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs);
+	pipe->set_vertex_elements(pipe, 3, mc->vertex_elems);
+	pipe->set_viewport_state(pipe, &mc->viewport);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf);
+	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf);
+
+	return 0;
+}
+
+/*static int vlGrabMacroBlock
+(
+	struct vlR16SnormMC *mc,
+	struct vlMpeg2MacroBlock *macroblock
+)
+{
+	assert(mc);
+	assert(macroblock);
+
+
+
+	return 0;
+}*/
+
+/*#define DO_IDCT*/
+
+#ifdef DO_IDCT
+static int vlTransformBlock(short *src, short *dst, short bias)	/* Two-pass transform of one block using the basis table; only compiled when DO_IDCT is defined */
+{
+	static const float basis[8][8] =
+	{
+		{0.3536,   0.4904,   0.4619,   0.4157,   0.3536,   0.2778,   0.1913,   0.0975},
+		{0.3536,   0.4157,   0.1913,  -0.0975,  -0.3536,  -0.4904,  -0.4619,  -0.2778},
+		{0.3536,   0.2778,  -0.1913,  -0.4904,  -0.3536,   0.0975,   0.4619,   0.4157},
+		{0.3536,   0.0975,  -0.4619,  -0.2778,   0.3536,   0.4157,  -0.1913,  -0.4904},
+		{0.3536,  -0.0975,  -0.4619,   0.2778,   0.3536,  -0.4157,  -0.1913,   0.4904},
+		{0.3536,  -0.2778,  -0.1913,   0.4904,  -0.3536,  -0.0975,   0.4619,  -0.4157},
+		{0.3536,  -0.4157,   0.1913,   0.0975,  -0.3536,   0.4904,  -0.4619,   0.2778},
+		{0.3536,  -0.4904,   0.4619,  -0.4157,   0.3536,  -0.2778,   0.1913,  -0.0975}
+	};
+
+	unsigned int	x, y;
+	short		tmp[64];	/* row-pass result; copying src first is what lets callers pass dst == src */
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)	/* pass 1: each output element combines the eight values of source row y */
+		for (x = 0; x < VL_BLOCK_WIDTH; ++x)
+			tmp[y * VL_BLOCK_WIDTH + x] = (short)
+			(
+				src[y * VL_BLOCK_WIDTH + 0] * basis[x][0] +
+				src[y * VL_BLOCK_WIDTH + 1] * basis[x][1] +
+				src[y * VL_BLOCK_WIDTH + 2] * basis[x][2] +
+				src[y * VL_BLOCK_WIDTH + 3] * basis[x][3] +
+				src[y * VL_BLOCK_WIDTH + 4] * basis[x][4] +
+				src[y * VL_BLOCK_WIDTH + 5] * basis[x][5] +
+				src[y * VL_BLOCK_WIDTH + 6] * basis[x][6] +
+				src[y * VL_BLOCK_WIDTH + 7] * basis[x][7]
+			);
+
+	for (x = 0; x < VL_BLOCK_WIDTH; ++x)	/* pass 2: same combination down column x of tmp, plus bias */
+		for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		{
+			dst[y * VL_BLOCK_WIDTH + x] = bias + (short)
+			(
+				tmp[0 * VL_BLOCK_WIDTH + x] * basis[y][0] +
+				tmp[1 * VL_BLOCK_WIDTH + x] * basis[y][1] +
+				tmp[2 * VL_BLOCK_WIDTH + x] * basis[y][2] +
+				tmp[3 * VL_BLOCK_WIDTH + x] * basis[y][3] +
+				tmp[4 * VL_BLOCK_WIDTH + x] * basis[y][4] +
+				tmp[5 * VL_BLOCK_WIDTH + x] * basis[y][5] +
+				tmp[6 * VL_BLOCK_WIDTH + x] * basis[y][6] +
+				tmp[7 * VL_BLOCK_WIDTH + x] * basis[y][7]
+			);
+			if (dst[y * VL_BLOCK_WIDTH + x] > 255)	/* upper clamp always applies */
+				dst[y * VL_BLOCK_WIDTH + x] = 255;
+			else if (bias > 0 && dst[y * VL_BLOCK_WIDTH + x] < 0)	/* lower clamp only when a positive bias was requested */
+				dst[y * VL_BLOCK_WIDTH + x] = 0;
+		}
+	return 0;
+}
+#endif
+
+static int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)	/* Copies one block row by row into consecutive rows of dst; dst_pitch is in shorts */
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memcpy
+		(
+			dst + y * dst_pitch,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2	/* byte count; assumes a 2-byte short */
+		);
+
+	return 0;
+}
+
+static int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)	/* Copies one block into every second row of dst, in two halves; dst_pitch is in shorts */
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y)	/* first half of the source rows -> dst rows 0, 2, 4, ... */
+		memcpy
+		(
+			dst + y * dst_pitch * 2,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2	/* byte count; assumes a 2-byte short */
+		);
+
+	dst += VL_BLOCK_HEIGHT * dst_pitch;	/* with y continuing from VL_BLOCK_HEIGHT / 2, the next write lands 2 * VL_BLOCK_HEIGHT rows below the original dst */
+
+	for (; y < VL_BLOCK_HEIGHT; ++y)	/* second half of the source rows, again on every second destination row */
+		memcpy
+		(
+			dst + y * dst_pitch * 2,
+			src + y * VL_BLOCK_WIDTH,
+			VL_BLOCK_WIDTH * 2
+		);
+
+	return 0;
+}
+
+static int vlGrabNoBlock(short *dst, unsigned int dst_pitch)	/* Zero-fills one block's worth of rows at dst; used by vlGrabBlocks() for blocks absent from the coded block pattern */
+{
+	unsigned int y;
+
+	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
+		memset
+		(
+			dst + y * dst_pitch,
+			0,
+			VL_BLOCK_WIDTH * 2	/* byte count; assumes a 2-byte short */
+		);
+
+	return 0;
+}
+
+enum vlSampleType	/* Only consulted by vlGrabBlocks() when DO_IDCT is defined, to pick the bias passed to vlTransformBlock() */
+{
+	vlSampleTypeFull,	/* bias 128; passed by vlRenderIMacroBlock() */
+	vlSampleTypeDiff	/* bias 0; passed by vlRenderPMacroBlock() */
+};
+
+static int vlGrabBlocks	/* Copies the coded blocks of one macroblock into textures[0..2], zero-filling uncoded ones; always returns 0 */
+(
+	struct vlR16SnormMC *mc,
+	unsigned int coded_block_pattern,	/* bits 5..2 select the four blocks for textures[0], bits 1 and 0 the blocks for textures[1] and [2] */
+	enum vlDCTType dct_type,
+	enum vlSampleType sample_type,	/* unused unless DO_IDCT is defined */
+	short *blocks	/* coded blocks only, packed back to back in pattern order */
+)
+{
+	struct pipe_surface	*tex_surface;
+	short			*texels;
+	unsigned int		tex_pitch;
+	unsigned int		tb, sb = 0;	/* tb: destination block slot, sb: index of the next coded block in blocks */
+
+	assert(mc);
+	assert(blocks);
+
+	tex_surface = mc->pipe->screen->get_tex_surface
+	(
+		mc->pipe->screen,
+		mc->textures[0],
+		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+	tex_pitch = tex_surface->stride / tex_surface->block.size;	/* NOTE(review): presumably the row pitch in texels - confirm the units of stride and block.size */
+
+	for (tb = 0; tb < 4; ++tb)
+	{
+		if ((coded_block_pattern >> (5 - tb)) & 1)
+		{
+			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+#ifdef DO_IDCT
+			vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 128 : 0);
+#endif
+
+			if (dct_type == vlDCTTypeFrameCoded)
+				vlGrabFrameCodedBlock
+				(
+					cur_block,
+					texels + tb * tex_pitch * VL_BLOCK_HEIGHT,	/* slot tb occupies its own band of VL_BLOCK_HEIGHT rows */
+					tex_pitch
+				);
+			else
+				vlGrabFieldCodedBlock
+				(
+					cur_block,
+					texels + (tb % 2) * tex_pitch * VL_BLOCK_HEIGHT + (tb / 2) * tex_pitch,	/* tb % 2 picks the starting band, tb / 2 the starting row (0 or 1) within it */
+					tex_pitch
+				);
+
+			++sb;
+		}
+		else
+			vlGrabNoBlock(texels + tb * tex_pitch * VL_BLOCK_HEIGHT, tex_pitch);
+	}
+
+	pipe_surface_unmap(tex_surface);
+
+	/* TODO: Implement 422, 444 */
+	for (tb = 0; tb < 2; ++tb)	/* one block each for textures[1] and textures[2] */
+	{
+		tex_surface = mc->pipe->screen->get_tex_surface
+		(
+			mc->pipe->screen,
+			mc->textures[tb + 1],
+			0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
+		);
+
+		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
+		tex_pitch = tex_surface->stride / tex_surface->block.size;
+
+		if ((coded_block_pattern >> (1 - tb)) & 1)
+		{
+			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
+
+#ifdef DO_IDCT
+			vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 128 : 0);
+#endif
+
+			vlGrabFrameCodedBlock	/* these blocks are always copied frame-coded, whatever dct_type is */
+			(
+				cur_block,
+				texels,
+				tex_pitch
+			);
+
+			++sb;
+		}
+		else
+			vlGrabNoBlock(texels, tex_pitch);
+
+		pipe_surface_unmap(tex_surface);
+	}
+
+	return 0;
+}
+
+int vlRenderIMacroBlock	/* Uploads one macroblock's blocks and draws it onto surface with the I shaders; always returns 0 */
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,	/* not used by this function */
+	unsigned int mbx,	/* macroblock position, in macroblock units */
+	unsigned int mby,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(blocks);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;	/* non-frame pictures are skipped without drawing, still reported as success */
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeFull, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];	/* macroblock size as a fraction of the target texture */
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];	/* macroblock origin as a fraction of the target texture */
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface	/* render straight into the destination surface's texture */
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+	pipe->set_sampler_textures(pipe, 3, mc->textures);	/* only the three textures filled by vlGrabBlocks() */
+	pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers);
+	pipe->bind_vs_state(pipe, mc->i_vs);
+	pipe->bind_fs_state(pipe, mc->i_fs);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);	/* 24 vertices, matching the 24-entry macroblock tables in vl_data.c */
+
+	return 0;
+}
+
+int vlRenderPMacroBlock	/* Uploads one macroblock's blocks and draws it onto surface, sampling ref_surface; always returns 0 */
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,	/* not used by this function */
+	unsigned int mbx,	/* macroblock position, in macroblock units */
+	unsigned int mby,
+	enum vlMotionType mc_type,
+	short top_x,	/* motion vector components; multiplied by 0.5 before being added to the pixel position */
+	short top_y,
+	short bottom_x,	/* bottom_* are only read for vlMotionTypeField */
+	short bottom_y,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *ref_surface,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(mc);
+	assert(blocks);
+	assert(ref_surface);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;	/* skipped without drawing, still reported as success */
+	/* TODO: Implement other MC types */
+	if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField)
+		return 0;	/* skipped without drawing, still reported as success */
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];	/* macroblock size as a fraction of the target texture */
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];	/* macroblock origin as a fraction of the target texture */
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_x * 0.5f) / (float)surface->texture->width[0];	/* origin displaced by half the vector, normalised with the target's size */
+	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;
+
+	if (mc_type == vlMotionTypeField)
+	{
+		vs_consts->denorm.x = (float)surface->texture->width[0];	/* denorm is only written on the field-motion path */
+		vs_consts->denorm.y = (float)surface->texture->height[0];
+
+		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
+
+		pipe->bind_vs_state(pipe, mc->p_vs[1]);	/* index 1: field-motion shader pair */
+		pipe->bind_fs_state(pipe, mc->p_fs[1]);
+	}
+	else
+	{
+		pipe->bind_vs_state(pipe, mc->p_vs[0]);	/* index 0: frame-motion shader pair */
+		pipe->bind_fs_state(pipe, mc->p_fs[0]);
+	}
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface	/* render straight into the destination surface's texture */
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+
+	mc->textures[3] = ref_surface->texture;	/* slot 3 carries the reference picture; slots 0..2 were filled by vlGrabBlocks() */
+	pipe->set_sampler_textures(pipe, 4, mc->textures);
+	pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);	/* 24 vertices, matching the 24-entry macroblock tables in vl_data.c */
+
+	return 0;
+}
+
+int vlRenderBMacroBlock	/* Renders one bi-predicted macroblock from a past and a future reference; always returns 0 */
+(
+	struct vlR16SnormMC *mc,
+	enum vlPictureType picture_type,
+	enum vlFieldOrder field_order,
+	unsigned int mbx,
+	unsigned int mby,
+	enum vlMotionType mc_type,
+	short top_past_x,
+	short top_past_y,
+	short bottom_past_x,
+	short bottom_past_y,
+	short top_future_x,
+	short top_future_y,
+	short bottom_future_x,
+	short bottom_future_y,
+	unsigned int coded_block_pattern,
+	enum vlDCTType dct_type,
+	short *blocks,
+	struct vlSurface *past_surface,
+	struct vlSurface *future_surface,
+	struct vlSurface *surface
+)
+{
+	struct pipe_context		*pipe;
+	struct vlVertexShaderConsts	*vs_consts;
+
+	assert(blocks);
+	assert(past_surface);	/* Both references are bound as sampler textures 3 and 4 below */
+	assert(future_surface);
+	assert(surface);
+
+	/* TODO: Implement interlaced rendering */
+	if (picture_type != vlPictureTypeFrame)
+		return 0;
+	/* TODO: Implement other MC types */
+	if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField)
+		return 0;
+
+	vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks);
+
+	pipe = mc->pipe;
+
+	vs_consts = pipe->winsys->buffer_map	/* NOTE(review): the mapping result is not checked before it is written to */
+	(
+		pipe->winsys,
+		mc->vs_const_buf.buffer,
+		PIPE_BUFFER_USAGE_CPU_WRITE
+	);
+
+	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0];
+	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0];
+	vs_consts->scale.z = 1.0f;
+	vs_consts->scale.w = 1.0f;
+	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0];
+	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0];
+	vs_consts->mb_pos_trans.z = 0.0f;
+	vs_consts->mb_pos_trans.w = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_past_x * 0.5f) / (float)surface->texture->width[0];
+	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_past_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;	/* [0] is built from the past-reference vector */
+	vs_consts->mb_tc_trans[1].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_future_x * 0.5f) / (float)surface->texture->width[0];
+	vs_consts->mb_tc_trans[1].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_future_y * 0.5f) / (float)surface->texture->height[0];
+	vs_consts->mb_tc_trans[1].top_field.z = 0.0f;
+	vs_consts->mb_tc_trans[1].top_field.w = 0.0f;	/* [1] is built from the future-reference vector */
+
+	if (mc_type == vlMotionTypeField)	/* Field MC also needs the bottom-field offsets and the denorm factors */
+	{
+		vs_consts->denorm.x = (float)surface->texture->width[0];
+		vs_consts->denorm.y = (float)surface->texture->height[0];
+
+		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_past_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_past_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
+		vs_consts->mb_tc_trans[1].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_future_x * 0.5f) / (float)surface->texture->width[0];
+		vs_consts->mb_tc_trans[1].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_future_y * 0.5f) / (float)surface->texture->height[0];
+		vs_consts->mb_tc_trans[1].bottom_field.z = 0.0f;
+		vs_consts->mb_tc_trans[1].bottom_field.w = 0.0f;
+
+		pipe->bind_vs_state(pipe, mc->b_vs[1]);	/* Index 1 selects the field-MC shader pair */
+		pipe->bind_fs_state(pipe, mc->b_fs[1]);
+	}
+	else
+	{
+		pipe->bind_vs_state(pipe, mc->b_vs[0]);	/* Index 0 selects the frame-MC shader pair */
+		pipe->bind_fs_state(pipe, mc->b_fs[0]);
+	}
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer);
+
+	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface	/* Render into level 0 of the destination texture */
+	(
+		pipe->screen,
+		surface->texture,
+		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+	);
+	pipe->set_framebuffer_state(pipe, &mc->render_target);
+
+	mc->textures[3] = past_surface->texture;
+	mc->textures[4] = future_surface->texture;
+	pipe->set_sampler_textures(pipe, 5, mc->textures);
+	pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);
+
+	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);	/* 24 vertices = 8 triangles, one quad per block */
+
+	return 0;
+}
+
+int vlRenderMacroBlocksMpeg2R16Snorm	/* Renders every macroblock of batch onto surface, dispatching on mb_type; always returns 0 */
+(
+	struct vlRender *render,
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
+)
+{
+	struct vlR16SnormMC	*mc;
+	unsigned int		i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;	/* render is treated as the vlR16SnormMC it was created as */
+
+	/*for (i = 0; i < batch->num_macroblocks; ++i)
+		vlGrabMacroBlock(batch->macroblocks[i]);*/
+
+	for (i = 0; i < batch->num_macroblocks; ++i)	/* The helpers' return values are not inspected */
+	{
+		switch (batch->macroblocks[i].mb_type)
+		{
+			case vlMacroBlockTypeIntra:	/* No reference surface or motion vectors are passed */
+			{
+				vlRenderIMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeFwdPredicted:	/* PMV[*][0][*] with the past surface as reference */
+			{
+				vlRenderPMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][0][0],
+					batch->macroblocks[i].PMV[0][0][1],
+					batch->macroblocks[i].PMV[1][0][0],
+					batch->macroblocks[i].PMV[1][0][1],
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->past_surface,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeBkwdPredicted:	/* Same renderer, but PMV[*][1][*] with the future surface as reference */
+			{
+				vlRenderPMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][1][0],
+					batch->macroblocks[i].PMV[0][1][1],
+					batch->macroblocks[i].PMV[1][1][0],
+					batch->macroblocks[i].PMV[1][1][1],
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->future_surface,
+					surface
+				);
+				break;
+			}
+			case vlMacroBlockTypeBiPredicted:	/* Both vector sets and both reference surfaces are passed */
+			{
+				vlRenderBMacroBlock
+				(
+					mc,
+					batch->picture_type,
+					batch->field_order,
+					batch->macroblocks[i].mbx,
+					batch->macroblocks[i].mby,
+					batch->macroblocks[i].mo_type,
+					batch->macroblocks[i].PMV[0][0][0],	/* top_past_x */
+					batch->macroblocks[i].PMV[0][0][1],	/* top_past_y */
+					batch->macroblocks[i].PMV[1][0][0],	/* bottom_past_x */
+					batch->macroblocks[i].PMV[1][0][1],	/* bottom_past_y */
+					batch->macroblocks[i].PMV[0][1][0],	/* top_future_x */
+					batch->macroblocks[i].PMV[0][1][1],	/* top_future_y */
+					batch->macroblocks[i].PMV[1][1][0],	/* bottom_future_x */
+					batch->macroblocks[i].PMV[1][1][1],	/* bottom_future_y */
+					batch->macroblocks[i].cbp,
+					batch->macroblocks[i].dct_type,
+					batch->macroblocks[i].blocks,
+					batch->past_surface,
+					batch->future_surface,
+					surface
+				);
+				break;
+			}
+			default:	/* Any other mb_type is treated as a programming error */
+				assert(0);
+		}
+	}
+
+	return 0;
+}
+
+int vlEnd	/* Performs no work beyond checking its argument; always returns 0 */
+(
+	struct vlRender *render
+)
+{
+	assert(render);
+
+	return 0;
+}
+
+int vlDestroy	/* Releases the pipe objects held by the renderer, then frees the renderer itself; always returns 0 */
+(
+	struct vlRender *render
+)
+{
+	struct vlR16SnormMC	*mc;
+	struct pipe_context	*pipe;
+	unsigned int		i;
+
+	assert(render);
+
+	mc = (struct vlR16SnormMC*)render;
+	pipe = mc->pipe;
+
+	for (i = 0; i < 5; ++i)	/* All five sampler states are deleted */
+		pipe->delete_sampler_state(pipe, mc->samplers[i]);
+
+	for (i = 0; i < 3; ++i)	/* Three vertex buffers are destroyed */
+		pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[i].buffer);
+
+	/* Textures 3 & 4 are not created directly, no need to release them here */
+	for (i = 0; i < 3; ++i)
+		pipe_texture_release(&mc->textures[i]);
+
+	pipe->delete_vs_state(pipe, mc->i_vs);
+	pipe->delete_fs_state(pipe, mc->i_fs);
+
+	for (i = 0; i < 2; ++i)	/* Two shader variants each for the p_* and b_* sets */
+	{
+		pipe->delete_vs_state(pipe, mc->p_vs[i]);
+		pipe->delete_fs_state(pipe, mc->p_fs[i]);
+		pipe->delete_vs_state(pipe, mc->b_vs[i]);
+		pipe->delete_fs_state(pipe, mc->b_fs[i]);
+	}
+
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer);
+	pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer);
+
+	free(mc);	/* mc aliases render, so render is invalid after this call */
+
+	return 0;
+}
+
+/*
+ * Represents 8 triangles (4 quads, 1 per block) in normalized coords
+ * that render a macroblock.
+ * Need to be scaled to cover mbW*mbH macroblock pixels and translated into
+ * position on target surface.
+ */
+const struct vlVertex2f macroblock_verts[24] =
+{
+	{0.0f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.0f},	/* Quad 0: x 0..0.5, y 0..0.5 */
+	{0.5f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.5f},
+
+	{0.5f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.0f},	/* Quad 1: x 0.5..1, y 0..0.5 */
+	{1.0f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.5f},
+
+	{0.0f, 0.5f}, {0.0f, 1.0f}, {0.5f, 0.5f},	/* Quad 2: x 0..0.5, y 0.5..1 */
+	{0.5f, 0.5f}, {0.0f, 1.0f}, {0.5f, 1.0f},
+
+	{0.5f, 0.5f}, {0.5f, 1.0f}, {1.0f, 0.5f},	/* Quad 3: x 0.5..1, y 0.5..1 */
+	{1.0f, 0.5f}, {0.5f, 1.0f}, {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above for rendering 4 luma blocks arranged
+ * in a bW*(bH*4) texture. First luma block located at 0,0->bW,bH; second at
+ * 0,bH->bW,2bH; third at 0,2bH->bW,3bH; fourth at 0,3bH->bW,4bH.
+ */
+const struct vlVertex2f macroblock_luma_texcoords[24] =
+{
+	{0.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.0f},	/* Block 0: t 0..0.25 */
+	{1.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.25f},
+
+	{0.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.25f},	/* Block 1: t 0.25..0.5 */
+	{1.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.5f},
+
+	{0.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.5f},	/* Block 2: t 0.5..0.75 */
+	{1.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.75f},
+
+	{0.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 0.75f},	/* Block 3: t 0.75..1 */
+	{1.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 1.0f}
+};
+
+/*
+ * Represents texcoords for the above for rendering 1 chroma block.
+ * Straightforward 0,0->1,1 mapping so we can reuse the MB pos vectors.
+ */
+const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts;
+
+/*
+ * Represents texcoords for the above for rendering 2 chroma blocks arranged
+ * in a bW*(bH*2) texture. First chroma block located at 0,0->bW,bH; second at
+ * 0,bH->bW,2bH. Together they fill the whole texture, so this is again a
+ * straightforward 0,0->1,1 mapping and we can reuse the MB pos vectors.
+ */
+const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts;
+
+/*
+ * Represents texcoords for the above for rendering 4 chroma blocks.
+ * Same case as 4 luma blocks, so the luma texcoords are reused.
+ */
+const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords;
+
+/*
+ * Fragment shader constants. NOTE(review): the two vectors appear to line up
+ * with the c0/c1 declarations in the fragment shaders of this file: the first
+ * is the factor that rescales 16-bit snorm samples to 9-bit snorm, the second
+ * holds 1/2 and 2 in .x/.y for Y-mod-2 top/bottom field selection. Confirm
+ * against the code that uploads this struct to the constant buffer.
+ */
+const struct vlFragmentShaderConsts fs_consts =
+{
+	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
+	{0.5f, 2.0f, 0.0f, 0.0f}
+};
+
+static int vlCreateVertexShaderIMB	/* Builds a TGSI vertex shader and stores the created state in mc->i_vs; always returns 0 */
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 50;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0..2 were written above; ti is the next free slot */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->i_vs = pipe->create_vs_state(pipe, &vs);
+	free(tokens);	/* The token array is released right after the state object is created */
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderIMB	/* Builds a TGSI fragment shader and stores the created state in mc->i_fs; always returns 0 */
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;	/* Tokens 0..2 were written above; ti is the next free slot */
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;	/* Shifts the mask to .x, .y, .z for i = 0, 1, 2 */
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul o0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->i_fs = pipe->create_fs_state(pipe, &fs);
+	free(tokens);	/* The token array is released right after the state object is created */
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFramePMB	/* Builds a TGSI vertex shader and stores the created state in mc->p_vs[0]; always returns 0 */
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0..2 were written above; ti is the next free slot */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Unused
+	 * decl c3		; Translation vector to move ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Ref macroblock texcoords
+	 */
+	for (i = 0; i < 4; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o3, t0, c3	; Translate rect into position on ref macroblock */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs);	/* Slot 0 is the one bound for frame motion compensation */
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateVertexShaderFieldPMB	/* Builds a TGSI vertex shader and stores the created state in mc->p_vs[1]; always returns 0 */
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0..2 were written above; ti is the next free slot */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration
+		(
+			&decl,
+			&tokens[ti],
+			header,
+			max_tokens - ti
+		);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Denorm coefficients
+	 * decl c3		; Translation vector to move top field ref macroblock texcoords into position
+	 * decl c4		; Translation vector to move bottom field ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Top field ref macroblock texcoords
+	 * decl o4		; Bottom field ref macroblock texcoords
+	 * decl o5		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 6; i++)
+	{	/* NOTE(review): o5 is declared POSITION here, while the field fragment shader declares its inputs GENERIC - confirm linkage */
+		decl = vl_decl_output((i == 0 || i == 5) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add t1, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mov o0, t1		; Move vertex pos to output */
+	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* add o3, t0, c3	; Translate top field rect into position on ref macroblock */
+	/* add o4, t0, c4	; Translate bottom field rect into position on ref macroblock */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o5, t1, c2	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->p_vs[1] = pipe->create_vs_state(pipe, &vs);	/* Slot 1 is the one bound for field motion compensation */
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFramePMB	/* Builds a TGSI fragment shader and stores the created state in mc->p_fs[0]; always returns 0 */
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));	/* NOTE(review): result is not checked */
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;	/* Tokens 0..2 were written above; ti is the next free slot */
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;	/* Shifts the mask to .x, .y, .z for i = 0, 1, 2 */
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* tex2d t1, i2, s3		; Read texel from ref macroblock */
+	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 2, TGSI_FILE_SAMPLER, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[0] = pipe->create_fs_state(pipe, &fs);	/* Slot 0 is the one bound for frame motion compensation */
+	free(tokens);
+
+	return 0;
+}
+
+static int vlCreateFragmentShaderFieldPMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s3
+	 * decl i4			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t4 */
+	decl = vl_decl_temps(0, 4);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for ref surface texture
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from ref macroblock top field
+	 * tex2d t2, i3, s3		; Read texel from ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i4.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->p_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);
+
+	return 0;
+}
+
+/* Emits the "FrameBMB" vertex shader (one past and one future ref texcoord output) as TGSI and stores it in mc->b_vs[0]; returns 0, or 1 if the token buffer cannot be allocated. */
+static int vlCreateVertexShaderFrameBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+	if (!tokens)
+		return 1;
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0-2 are taken by the version, header and processor written above */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Unused
+	 * decl c3		; Translation vector to move past ref macroblock texcoords into position
+	 * decl c4		; Unused
+	 * decl c5		; Translation vector to move future ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 5);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Past ref macroblock texcoords
+	 * decl o4		; Future ref macroblock texcoords
+	 */
+	for (i = 0; i < 5; i++)
+	{
+		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0 */
+	decl = vl_decl_temps(0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o3, t0, c3	; Translate rect into position on past ref macroblock
+	 * add o4, t0, c5	; Translate rect into position on future ref macroblock (constant index is i * 2 + 3) */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i * 2 + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[0] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);	/* NOTE(review): assumes create_vs_state() keeps no pointer into tokens once it returns - confirm */
+
+	return 0;
+}
+
+/* Emits the "FieldBMB" vertex shader (top/bottom field texcoords for the past and future refs plus a denormalized position output) as TGSI and stores it in mc->b_vs[1]; returns 0, or 1 if the token buffer cannot be allocated. */
+static int vlCreateVertexShaderFieldBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	vs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+	if (!tokens)
+		return 1;
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+	ti = 3;	/* Tokens 0-2 are taken by the version, header and processor written above */
+
+	/*
+	 * decl i0		; Vertex pos
+	 * decl i1		; Luma texcoords
+	 * decl i2		; Chroma texcoords
+	 */
+	for (i = 0; i < 3; i++)
+	{
+		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0		; Scaling vector to scale unit rect to macroblock size
+	 * decl c1		; Translation vector to move macroblock into position
+	 * decl c2		; Denorm coefficients
+	 * decl c3		; Translation vector to move top field past ref macroblock texcoords into position
+	 * decl c4		; Translation vector to move bottom field past ref macroblock texcoords into position
+	 * decl c5		; Translation vector to move top field future ref macroblock texcoords into position
+	 * decl c6		; Translation vector to move bottom field future ref macroblock texcoords into position
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl o0		; Vertex pos
+	 * decl o1		; Luma texcoords
+	 * decl o2		; Chroma texcoords
+	 * decl o3		; Top field past ref macroblock texcoords
+	 * decl o4		; Bottom field past ref macroblock texcoords
+	 * decl o5		; Top field future ref macroblock texcoords
+	 * decl o6		; Bottom field future ref macroblock texcoords
+	 * decl o7		; Denormalized vertex pos
+	 */
+	for (i = 0; i < 8; i++)
+	{
+		decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* decl t0, t1 */
+	decl = vl_decl_temps(0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t0, i0, c0	; Scale unit rect to normalized MB size */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add t1, t0, c1	; Translate rect into position */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mov o0, t1		; Move vertex pos to output */
+	inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * mov o1, i1		; Move input luma texcoords to output
+	 * mov o2, i2		; Move input chroma texcoords to output
+	 */
+	for (i = 1; i < 3; ++i)
+	{
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * add o3, t0, c3	; Translate top field rect into position on past ref macroblock
+	 * add o4, t0, c4	; Translate bottom field rect into position on past ref macroblock
+	 * add o5, t0, c5	; Translate top field rect into position on future ref macroblock
+	 * add o6, t0, c6	; Translate bottom field rect into position on future ref macroblock
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* mul o7, t1, c2	; Denorm vertex pos */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 7, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	vs.tokens = tokens;
+	mc->b_vs[1] = pipe->create_vs_state(pipe, &vs);
+	free(tokens);	/* NOTE(review): assumes create_vs_state() keeps no pointer into tokens once it returns - confirm */
+
+	return 0;
+}
+
+/* Emits the "FrameBMB" fragment shader (rescaled differential texel added to a c1.x-weighted blend of the past and future ref texels) as TGSI and stores it in mc->b_fs[0]; returns 0, or 1 if the token buffer cannot be allocated. */
+static int vlCreateFragmentShaderFrameBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 100;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+	if (!tokens)
+		return 1;
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;	/* Tokens 0-2 are taken by the version, header and processor written above */
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s4
+	 */
+	for (i = 0; i < 4; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t2 */
+	decl = vl_decl_temps(0, 2);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for past ref surface texture
+	 * decl s4			; Sampler for future ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;	/* Pass i writes component .x, .y, then .z */
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from past ref macroblock
+	 * tex2d t2, i3, s4		; Read texel from future ref macroblock
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, i + 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[0] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);	/* NOTE(review): assumes create_fs_state() keeps no pointer into tokens once it returns - confirm */
+
+	return 0;
+}
+
+/* Emits the "FieldBMB" fragment shader (rescaled differential texel added to blended past/future ref texels, each ref picked between its top and bottom field fetch by Y % 2) as TGSI and stores it in mc->b_fs[1]; returns 0, or 1 if the token buffer cannot be allocated. */
+static int vlCreateFragmentShaderFieldBMB
+(
+	struct vlR16SnormMC *mc
+)
+{
+	const unsigned int		max_tokens = 200;
+
+	struct pipe_context		*pipe;
+	struct pipe_shader_state	fs;
+	struct tgsi_token		*tokens;
+	struct tgsi_header		*header;
+	struct tgsi_full_declaration	decl;
+	struct tgsi_full_instruction	inst;
+	unsigned int			ti;
+	unsigned int			i;
+
+	assert(mc);
+	pipe = mc->pipe;
+	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
+	if (!tokens)
+		return 1;
+
+	/* Version */
+	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+	/* Header */
+	header = (struct tgsi_header*)&tokens[1];
+	*header = tgsi_build_header();
+	/* Processor */
+	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+	ti = 3;	/* Tokens 0-2 are taken by the version, header and processor written above */
+
+	/*
+	 * decl i0			; Texcoords for s0
+	 * decl i1			; Texcoords for s1, s2
+	 * decl i2			; Texcoords for s3
+	 * decl i3			; Texcoords for s3
+	 * decl i4			; Texcoords for s4
+	 * decl i5			; Texcoords for s4
+	 * decl i6			; Denormalized vertex pos
+	 */
+	for (i = 0; i < 7; ++i)
+	{
+		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
+	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
+	 *				; and for Y-mod-2 top/bottom field selection
+	 */
+	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl o0			; Fragment color */
+	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/* decl t0-t5 */
+	decl = vl_decl_temps(0, 5);
+	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * decl s0			; Sampler for luma texture
+	 * decl s1			; Sampler for chroma Cb texture
+	 * decl s2			; Sampler for chroma Cr texture
+	 * decl s3			; Sampler for past ref surface texture
+	 * decl s4			; Sampler for future ref surface texture
+	 */
+	for (i = 0; i < 5; ++i)
+	{
+		decl = vl_decl_samplers(i, i);
+		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/*
+	 * tex2d t1, i0, s0		; Read texel from luma texture
+	 * mov t0.x, t1.x		; Move luma sample into .x component
+	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
+	 * mov t0.y, t1.x		; Move Cb sample into .y component
+	 * tex2d t1, i1, s2		; Read texel from chroma Cr texture
+	 * mov t0.z, t1.x		; Move Cr sample into .z component
+	 */
+	for (i = 0; i < 3; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;	/* Pass i writes component .x, .y, then .z */
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	}
+
+	/* mul t0, t0, c0		; Rescale texel to correct range */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* XXX: Pos values off by 0.5? */
+	/* sub t4, i6.y, c1.x		; Sub 0.5 from denormalized pos */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 6, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* floor t3, t3			; Get rid of fractional part */
+	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* mul t3, t3, c1.y		; Multiply by 2 */
+	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
+	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 (t4 is free for reuse after this) */
+	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t1, i2, s3		; Read texel from past ref macroblock top field
+	 * tex2d t2, i3, s3		; Read texel from past ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/*
+	 * tex2d t4, i4, s4		; Read texel from future ref macroblock top field
+	 * tex2d t5, i5, s4		; Read texel from future ref macroblock bottom field
+	 */
+	for (i = 0; i < 2; ++i)
+	{
+		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 4, TGSI_FILE_SAMPLER, 4);
+		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	}
+
+	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
+	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
+	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
+	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	/* end */
+	inst = vl_end();
+	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+	fs.tokens = tokens;
+	mc->b_fs[1] = pipe->create_fs_state(pipe, &fs);
+	free(tokens);	/* NOTE(review): assumes create_fs_state() keeps no pointer into tokens once it returns - confirm */
+
+	return 0;
+}
+
+/*
+ * Creates the vertex, texcoord and shader constant buffers used by the MC
+ * renderer and copies their initial contents in.
+ *
+ * mc: renderer to set up. mc->pipe is read; vertex_bufs[0..2],
+ *     vertex_elems[0..2], vs_const_buf and fs_const_buf are written.
+ *
+ * Always returns 0.
+ * NOTE(review): buffer_create() and buffer_map() results are used unchecked.
+ */
+int vlCreateDataBufs
+(
+	struct vlR16SnormMC *mc
+)
+{
+	/* Every vertex buffer holds this many vlVertex2f entries */
+	const unsigned int	num_verts = 24;
+
+	/* Data copied into vertex_bufs[0], [1] and [2] respectively */
+	const void		*contents[3];
+	struct pipe_context	*pipe;
+	unsigned int		n;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	contents[0] = macroblock_verts;
+	contents[1] = macroblock_luma_texcoords;
+	/* TODO: Accommodate 422, 444 */
+	contents[2] = macroblock_chroma_420_texcoords;
+
+	/*
+	 * Buffer 0 carries vertex positions, buffers 1 and 2 carry luma and
+	 * chroma texcoords. All three share one layout, and vertex element n
+	 * reads two floats per vertex from buffer n.
+	 */
+	for (n = 0; n < 3; ++n)
+	{
+		mc->vertex_bufs[n].pitch = sizeof(struct vlVertex2f);
+		mc->vertex_bufs[n].max_index = num_verts - 1;
+		mc->vertex_bufs[n].buffer_offset = 0;
+		mc->vertex_bufs[n].buffer = pipe->winsys->buffer_create
+		(
+			pipe->winsys,
+			1,
+			PIPE_BUFFER_USAGE_VERTEX,
+			sizeof(struct vlVertex2f) * num_verts
+		);
+
+		mc->vertex_elems[n].src_offset = 0;
+		mc->vertex_elems[n].vertex_buffer_index = n;
+		mc->vertex_elems[n].nr_components = 2;
+		mc->vertex_elems[n].src_format = PIPE_FORMAT_R32G32_FLOAT;
+	}
+
+	/* Fill buffers: map each one for CPU writes and copy its table in */
+	for (n = 0; n < 3; ++n)
+	{
+		void *dst = pipe->winsys->buffer_map
+		(
+			pipe->winsys,
+			mc->vertex_bufs[n].buffer,
+			PIPE_BUFFER_USAGE_CPU_WRITE
+		);
+
+		memcpy(dst, contents[n], sizeof(struct vlVertex2f) * num_verts);
+	}
+
+	for (n = 0; n < 3; ++n)
+		pipe->winsys->buffer_unmap(pipe->winsys, mc->vertex_bufs[n].buffer);
+
+	/* Create our constant buffers; the vertex shader one is not filled here */
+	mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts);
+	mc->vs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->vs_const_buf.size
+	);
+
+	mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts);
+	mc->fs_const_buf.buffer = pipe->winsys->buffer_create
+	(
+		pipe->winsys,
+		1,
+		PIPE_BUFFER_USAGE_CONSTANT,
+		mc->fs_const_buf.size
+	);
+
+	/* Copy fs_consts into the fragment shader constant buffer */
+	memcpy
+	(
+		pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+		&fs_consts,
+		sizeof(struct vlFragmentShaderConsts)
+	);
+
+	pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer);
+
+	return 0;
+}
+
+/* One-time setup of a new MC renderer: viewport, render target, five samplers, textures[0..2], every shader and the data buffers; returns 0, or 1 as soon as a shader/buffer builder reports failure. */
+static int vlInit
+(
+	struct vlR16SnormMC *mc
+)
+{
+	struct pipe_context		*pipe;
+	struct pipe_sampler_state	sampler;
+	struct pipe_texture		template;
+	unsigned int			filters[5];
+	unsigned int			i;
+
+	assert(mc);
+
+	pipe = mc->pipe;
+
+	/* For MC we render to textures, which are rounded up to nearest POT */
+	mc->viewport.scale[0] = vlRoundUpPOT(mc->video_width);
+	mc->viewport.scale[1] = vlRoundUpPOT(mc->video_height);
+	mc->viewport.scale[2] = 1;
+	mc->viewport.scale[3] = 1;
+	mc->viewport.translate[0] = 0;
+	mc->viewport.translate[1] = 0;
+	mc->viewport.translate[2] = 0;
+	mc->viewport.translate[3] = 0;
+
+	mc->render_target.width = vlRoundUpPOT(mc->video_width);
+	mc->render_target.height = vlRoundUpPOT(mc->video_height);
+	mc->render_target.num_cbufs = 1;
+	/* FB for MC stage is a vlSurface, set in vlSetRenderSurface() */
+	mc->render_target.zsbuf = NULL;
+
+	filters[0] = PIPE_TEX_FILTER_NEAREST;
+	filters[1] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+	filters[2] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR;
+	filters[3] = PIPE_TEX_FILTER_LINEAR;
+	filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+	/*
+	 * Zero first: fields not assigned below (prefilter, shadow_ambient, lod_bias, max_lod, border_color, max_anisotropy) must not reach the driver holding indeterminate stack values
+	 */
+	memset(&sampler, 0, sizeof(struct pipe_sampler_state));
+	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+	sampler.compare_func = PIPE_FUNC_ALWAYS;
+	sampler.normalized_coords = 1;
+	sampler.min_lod = 0;
+
+	for (i = 0; i < 5; ++i)
+	{
+		sampler.min_img_filter = filters[i];	/* Only the image filters differ between the five samplers */
+		sampler.mag_img_filter = filters[i];
+		mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler);
+	}
+
+	memset(&template, 0, sizeof(struct pipe_texture));
+	template.target = PIPE_TEXTURE_2D;
+	template.format = PIPE_FORMAT_R16_SNORM;
+	template.last_level = 0;
+	template.width[0] = 8;
+	template.height[0] = 8 * 4;
+	template.depth[0] = 1;
+	template.compressed = 0;
+	pf_get_block(template.format, &template.block);
+
+	mc->textures[0] = pipe->screen->texture_create(pipe->screen, &template);
+
+	if (mc->video_format == vlFormatYCbCr420)
+		template.height[0] = 8;
+	else if (mc->video_format == vlFormatYCbCr422)
+		template.height[0] = 8 * 2;
+	else if (mc->video_format == vlFormatYCbCr444)
+		template.height[0] = 8 * 4;
+	else
+		assert(0);
+
+	mc->textures[1] = pipe->screen->texture_create(pipe->screen, &template);
+	mc->textures[2] = pipe->screen->texture_create(pipe->screen, &template);
+
+	/* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */
+
+	/* Builders run in the order listed; || stops at the first one that returns non-zero */
+	if
+	(
+		vlCreateVertexShaderIMB(mc) || vlCreateFragmentShaderIMB(mc) ||
+		vlCreateVertexShaderFramePMB(mc) || vlCreateVertexShaderFieldPMB(mc) ||
+		vlCreateFragmentShaderFramePMB(mc) || vlCreateFragmentShaderFieldPMB(mc) ||
+		vlCreateVertexShaderFrameBMB(mc) || vlCreateVertexShaderFieldBMB(mc) ||
+		vlCreateFragmentShaderFrameBMB(mc) || vlCreateFragmentShaderFieldBMB(mc) ||
+		vlCreateDataBufs(mc)
+	)
+		return 1;
+
+	return 0;
+}
+
+/* Creates an R16 SNORM MC renderer for a video_width x video_height stream of video_format on pipe and returns its vlRender interface in *render; returns 0 on success, 1 if the renderer cannot be allocated (*render is then left untouched). */
+int vlCreateR16SNormMC
+(
+	struct pipe_context *pipe,
+	unsigned int video_width,
+	unsigned int video_height,
+	enum vlFormat video_format,
+	struct vlRender **render
+)
+{
+	struct vlR16SnormMC *mc;
+
+	assert(pipe);
+	assert(render);
+
+	mc = calloc(1, sizeof(struct vlR16SnormMC));
+	if (!mc)
+		return 1;
+	mc->base.vlBegin = &vlBegin;
+	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16Snorm;
+	mc->base.vlEnd = &vlEnd;
+	mc->base.vlDestroy = &vlDestroy;
+	mc->pipe = pipe;
+	mc->video_width = video_width;
+	mc->video_height = video_height;
+	mc->video_format = video_format;	/* Must be stored before vlInit(): it selects the chroma filters and chroma texture height there */
+	vlInit(mc);	/* NOTE(review): return value is ignored */
+	*render = &mc->base;
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h
new file mode 100644
index 0000000000..a6eecf05b6
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h
@@ -0,0 +1,18 @@
+#ifndef vl_mc_h	/* NOTE(review): guard is vl_mc_h although this file is vl_r16snorm_mc.h - confirm no other header uses the same guard */
+#define vl_mc_h
+
+#include "vl_types.h"
+
+struct pipe_context;
+struct vlRender;
+/* Allocates the renderer implemented in vl_r16snorm_mc.c for the given pipe context and video size/format, stores a pointer to its vlRender function table in *render, and returns an int status (0 on success). */
+int vlCreateR16SNormMC
+(
+	struct pipe_context *pipe,
+	unsigned int video_width,
+	unsigned int video_height,
+	enum vlFormat video_format,
+	struct vlRender **render
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_render.h b/src/gallium/state_trackers/g3dvl/vl_render.h
new file mode 100644
index 0000000000..63016b5cbe
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_render.h
@@ -0,0 +1,33 @@
+#ifndef vl_render_h
+#define vl_render_h
+
+#include "vl_types.h"
+
+struct pipe_surface;
+
+struct vlRender	/* function table a renderer fills in (see vlCreateR16SNormMC); each hook receives the vlRender it belongs to */
+{
+	int (*vlBegin)	/* called by vlRenderMacroBlocksMpeg2() in vl_surface.c before the batch hook */
+	(
+		struct vlRender *render
+	);
+
+	int (*vlRenderMacroBlocksMpeg2)	/* receives the batch and surface passed to vlRenderMacroBlocksMpeg2() in vl_surface.c */
+	(
+		struct vlRender *render,
+		struct vlMpeg2MacroBlockBatch *batch,
+		struct vlSurface *surface
+	);
+
+	int (*vlEnd)	/* called by vlRenderMacroBlocksMpeg2() in vl_surface.c after the batch hook */
+	(
+		struct vlRender *render
+	);
+
+	int (*vlDestroy)	/* NOTE(review): presumably tears the renderer down; no caller is visible here - confirm */
+	(
+		struct vlRender *render
+	);
+};
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.c b/src/gallium/state_trackers/g3dvl/vl_screen.c
new file mode 100644
index 0000000000..484f63b0d4
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_screen.c
@@ -0,0 +1,115 @@
+#define VL_INTERNAL
+#include "vl_screen.h"
+#include <assert.h>
+#include <stdlib.h>
+
+int vlCreateScreen
+(
+	struct vlDisplay *display,
+	int screen,
+	struct pipe_screen *pscreen,
+	struct vlScreen **vl_screen
+)
+{
+	/* Allocates a vlScreen recording display, screen and pscreen; stores it in *vl_screen. Returns 0, or 1 if allocation fails. */
+	struct vlScreen *new_screen;
+
+	assert(display);
+	assert(pscreen);
+	assert(vl_screen);
+
+	new_screen = calloc(1, sizeof *new_screen);
+	if (new_screen == NULL)
+		return 1;
+
+	new_screen->pscreen = pscreen;
+	new_screen->ordinal = screen;
+	new_screen->display = display;
+
+	*vl_screen = new_screen;
+	return 0;
+}
+
+int vlDestroyScreen
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+	/* Only the vlScreen allocation itself is freed; the display and pipe_screen it points to are not touched. */
+	free(screen);
+
+	return 0;
+}
+
+struct vlDisplay* vlGetDisplay
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+	/* The display pointer recorded by vlCreateScreen(). */
+	return screen->display;
+}
+
+struct pipe_screen* vlGetPipeScreen
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+	/* The pipe_screen pointer recorded by vlCreateScreen(). */
+	return screen->pscreen;
+}
+
+unsigned int vlGetMaxProfiles
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+	/* vlProfileCount is the trailing enumerator of enum vlProfile, so this is the number of profiles defined there. */
+	return vlProfileCount;
+}
+
+int vlQueryProfiles
+(
+	struct vlScreen *screen,
+	enum vlProfile *profiles
+)
+{
+	assert(screen);
+	assert(profiles);
+	/* Always writes two entries, so profiles must have room for at least two; always returns 0. */
+	profiles[0] = vlProfileMpeg2Simple;
+	profiles[1] = vlProfileMpeg2Main;
+
+	return 0;
+}
+
+unsigned int vlGetMaxEntryPoints
+(
+	struct vlScreen *screen
+)
+{
+	assert(screen);
+	/* vlEntryPointCount is the trailing enumerator of enum vlEntryPoint, so this is the number of entry points defined there. */
+	return vlEntryPointCount;
+}
+
+int vlQueryEntryPoints
+(
+	struct vlScreen *screen,
+	enum vlProfile profile,
+	enum vlEntryPoint *entry_points
+)
+{
+	assert(screen);
+	assert(entry_points);
+	/* profile is not consulted: the same three entries are written for every profile, so entry_points needs room for three. */
+	entry_points[0] = vlEntryPointIDCT;
+	entry_points[1] = vlEntryPointMC;
+	entry_points[2] = vlEntryPointCSC;
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.h b/src/gallium/state_trackers/g3dvl/vl_screen.h
new file mode 100644
index 0000000000..98f3d429b6
--- /dev/null
+++ b/src/gallium/state_trackers/g3dvl/vl_screen.h
@@ -0,0 +1,63 @@
+#ifndef vl_screen_h
+#define vl_screen_h
+
+#include "vl_types.h"
+
+struct pipe_screen;
+
+#ifdef VL_INTERNAL
+struct vlScreen	/* only visible to translation units that define VL_INTERNAL before including this header */
+{
+	struct vlDisplay	*display;	/* display argument given to vlCreateScreen() */
+	unsigned int		ordinal;	/* screen argument given to vlCreateScreen() (passed as int, stored unsigned) */
+	struct pipe_screen	*pscreen;	/* pscreen argument given to vlCreateScreen(); not freed by vlDestroyScreen() */
+};
+#endif
+
+int vlCreateScreen
+(
+	struct vlDisplay *display,
+	int screen,
+	struct pipe_screen *pscreen,
+	struct vlScreen **vl_screen
+);
+
+int vlDestroyScreen
+(
+	struct vlScreen *screen
+);
+
+struct vlDisplay* vlGetDisplay
+(
+	struct vlScreen *screen
+);
+
+struct pipe_screen* vlGetPipeScreen
+(
+	struct vlScreen *screen
+);
+
+unsigned int vlGetMaxProfiles
+(
+	struct vlScreen *screen
+);
+
+int vlQueryProfiles
+(
+	struct vlScreen *screen,
+	enum vlProfile *profiles
+);
+
+unsigned int vlGetMaxEntryPoints
+(
+	struct vlScreen *screen
+);
+
+int vlQueryEntryPoints
+(
+	struct vlScreen *screen,
+	enum vlProfile profile,
+	enum vlEntryPoint *entry_points
+);
+
+#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.c b/src/gallium/state_trackers/g3dvl/vl_shader_build.c
index 5f30e23ff8..51f1721a33 100644
--- a/src/gallium/state_trackers/g3dvl/vl_shader_build.c
+++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.c
@@ -13,7 +13,7 @@ struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index
 	decl.Semantic.SemanticIndex = index;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
@@ -27,7 +27,7 @@ struct tgsi_full_declaration vl_decl_interpolated_input
 )
 {
 	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-	
+
 	assert
 	(
 		interpolation == TGSI_INTERPOLATE_CONSTANT ||
@@ -42,21 +42,21 @@ struct tgsi_full_declaration vl_decl_interpolated_input
 	decl.Declaration.Interpolate = interpolation;;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
 struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
 {
 	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-	
+
 	decl.Declaration.File = TGSI_FILE_CONSTANT;
 	decl.Declaration.Semantic = 1;
 	decl.Semantic.SemanticName = name;
 	decl.Semantic.SemanticIndex = index;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
@@ -70,7 +70,7 @@ struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int inde
 	decl.Semantic.SemanticIndex = index;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
@@ -82,7 +82,7 @@ struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last
 	decl.Declaration.File = TGSI_FILE_TEMPORARY;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
@@ -94,7 +94,7 @@ struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int l
 	decl.Declaration.File = TGSI_FILE_SAMPLER;
 	decl.DeclarationRange.First = first;
 	decl.DeclarationRange.Last = last;
-	
+
 	return decl;
 }
 
@@ -108,7 +108,7 @@ struct tgsi_full_instruction vl_inst2
 )
 {
 	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-	
+
 	inst.Instruction.Opcode = opcode;
 	inst.Instruction.NumDstRegs = 1;
 	inst.FullDstRegisters[0].DstRegister.File = dst_file;
@@ -116,7 +116,7 @@ struct tgsi_full_instruction vl_inst2
 	inst.Instruction.NumSrcRegs = 1;
 	inst.FullSrcRegisters[0].SrcRegister.File = src_file;
 	inst.FullSrcRegisters[0].SrcRegister.Index = src_index;
-	
+
 	return inst;
 }
 
@@ -132,7 +132,7 @@ struct tgsi_full_instruction vl_inst3
 )
 {
 	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-	
+
 	inst.Instruction.Opcode = opcode;
 	inst.Instruction.NumDstRegs = 1;
 	inst.FullDstRegisters[0].DstRegister.File = dst_file;
@@ -142,7 +142,7 @@ struct tgsi_full_instruction vl_inst3
 	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
 	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
 	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
-	
+
 	return inst;
 }
 
@@ -158,7 +158,7 @@ struct tgsi_full_instruction vl_tex
 )
 {
 	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-	
+
 	inst.Instruction.Opcode = TGSI_OPCODE_TEX;
 	inst.Instruction.NumDstRegs = 1;
 	inst.FullDstRegisters[0].DstRegister.File = dst_file;
@@ -169,7 +169,7 @@ struct tgsi_full_instruction vl_tex
 	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
 	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
 	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
-	
+
 	return inst;
 }
 
@@ -187,7 +187,7 @@ struct tgsi_full_instruction vl_inst4
 )
 {
 	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-	
+
 	inst.Instruction.Opcode = opcode;
 	inst.Instruction.NumDstRegs = 1;
 	inst.FullDstRegisters[0].DstRegister.File = dst_file;
@@ -199,18 +199,17 @@ struct tgsi_full_instruction vl_inst4
 	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
 	inst.FullSrcRegisters[2].SrcRegister.File = src3_file;
 	inst.FullSrcRegisters[2].SrcRegister.Index = src3_index;
-	
+
 	return inst;
 }
 
 struct tgsi_full_instruction vl_end(void)
 {
 	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-	
+
 	inst.Instruction.Opcode = TGSI_OPCODE_END;
 	inst.Instruction.NumDstRegs = 0;
 	inst.Instruction.NumSrcRegs = 0;
-	
+
 	return inst;
 }
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.h b/src/gallium/state_trackers/g3dvl/vl_shader_build.h
index 878d7e2c45..dc615cb156 100644
--- a/src/gallium/state_trackers/g3dvl/vl_shader_build.h
+++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.h
@@ -59,4 +59,3 @@ struct tgsi_full_instruction vl_inst4
 struct tgsi_full_instruction vl_end(void);
 
 #endif
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.c b/src/gallium/state_trackers/g3dvl/vl_surface.c
index 1386b1107c..ffc8122172 100644
--- a/src/gallium/state_trackers/g3dvl/vl_surface.c
+++ b/src/gallium/state_trackers/g3dvl/vl_surface.c
@@ -1,628 +1,177 @@
+#define VL_INTERNAL
 #include "vl_surface.h"
 #include <assert.h>
 #include <stdlib.h>
-#include <pipe/p_context.h>
+#include <string.h>
+#include <pipe/p_screen.h>
 #include <pipe/p_state.h>
-#include <pipe/p_format.h>
 #include <pipe/p_inlines.h>
 #include <vl_winsys.h>
+#include "vl_screen.h"
 #include "vl_context.h"
-#include "vl_defs.h"
+#include "vl_render.h"
+#include "vl_csc.h"
 #include "vl_util.h"
 
-/*#define DO_IDCT*/
-
-#ifdef DO_IDCT
-static int vlTransformBlock(short *src, short *dst, short bias)
+int vlCreateSurface
+(
+	struct vlScreen *screen,
+	unsigned int width,
+	unsigned int height,
+	enum vlFormat format,
+	struct vlSurface **surface
+)
 {
-	static const float basis[8][8] =
-	{
-		{0.3536,   0.4904,   0.4619,   0.4157,   0.3536,   0.2778,   0.1913,   0.0975},
-		{0.3536,   0.4157,   0.1913,  -0.0975,  -0.3536,  -0.4904,  -0.4619,  -0.2778},
-		{0.3536,   0.2778,  -0.1913,  -0.4904,  -0.3536,   0.0975,   0.4619,   0.4157},
-		{0.3536,   0.0975,  -0.4619,  -0.2778,   0.3536,   0.4157,  -0.1913,  -0.4904},
-		{0.3536,  -0.0975,  -0.4619,   0.2778,   0.3536,  -0.4157,  -0.1913,   0.4904},
-		{0.3536,  -0.2778,  -0.1913,   0.4904,  -0.3536,  -0.0975,   0.4619,  -0.4157},
-		{0.3536,  -0.4157,   0.1913,   0.0975,  -0.3536,   0.4904,  -0.4619,   0.2778},
-		{0.3536,  -0.4904,   0.4619,  -0.4157,   0.3536,  -0.2778,   0.1913,  -0.0975}
-	};
-	
-	unsigned int	x, y;
-	short		tmp[64];
-	
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		for (x = 0; x < VL_BLOCK_WIDTH; ++x)
-			tmp[y * VL_BLOCK_WIDTH + x] = (short)
-			(
-				src[y * VL_BLOCK_WIDTH + 0] * basis[x][0] +
-				src[y * VL_BLOCK_WIDTH + 1] * basis[x][1] +
-				src[y * VL_BLOCK_WIDTH + 2] * basis[x][2] +
-				src[y * VL_BLOCK_WIDTH + 3] * basis[x][3] +
-				src[y * VL_BLOCK_WIDTH + 4] * basis[x][4] +
-				src[y * VL_BLOCK_WIDTH + 5] * basis[x][5] +
-				src[y * VL_BLOCK_WIDTH + 6] * basis[x][6] +
-				src[y * VL_BLOCK_WIDTH + 7] * basis[x][7]
-			);
-
-	for (x = 0; x < VL_BLOCK_WIDTH; ++x)
-		for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		{
-			dst[y * VL_BLOCK_WIDTH + x] = bias + (short)
-			(
-				tmp[0 * VL_BLOCK_WIDTH + x] * basis[y][0] +
-				tmp[1 * VL_BLOCK_WIDTH + x] * basis[y][1] +
-				tmp[2 * VL_BLOCK_WIDTH + x] * basis[y][2] +
-				tmp[3 * VL_BLOCK_WIDTH + x] * basis[y][3] +
-				tmp[4 * VL_BLOCK_WIDTH + x] * basis[y][4] +
-				tmp[5 * VL_BLOCK_WIDTH + x] * basis[y][5] +
-				tmp[6 * VL_BLOCK_WIDTH + x] * basis[y][6] +
-				tmp[7 * VL_BLOCK_WIDTH + x] * basis[y][7]
-			);
-			if (dst[y * VL_BLOCK_WIDTH + x] > 255)
-				dst[y * VL_BLOCK_WIDTH + x] = 255;
-			else if (bias > 0 && dst[y * VL_BLOCK_WIDTH + x] < 0)
-				dst[y * VL_BLOCK_WIDTH + x] = 0;
-		}
-	return 0;
-}
-#endif
+	struct vlSurface	*sfc;
+	struct pipe_texture	template;
 
-static int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-	
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		memcpy
-		(
-			dst + y * dst_pitch,
-			src + y * VL_BLOCK_WIDTH,
-			VL_BLOCK_WIDTH * 2
-		);
-	
-	return 0;
-}
+	assert(screen);
+	assert(surface);
 
-static int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-	
-	for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y)
-		memcpy
-		(
-			dst + y * dst_pitch * 2,
-			src + y * VL_BLOCK_WIDTH,
-			VL_BLOCK_WIDTH * 2
-		);
-	
-	dst += VL_BLOCK_HEIGHT * dst_pitch;
-	
-	for (; y < VL_BLOCK_HEIGHT; ++y)
-		memcpy
-		(
-			dst + y * dst_pitch * 2,
-			src + y * VL_BLOCK_WIDTH,
-			VL_BLOCK_WIDTH * 2
-		);
-	
-	return 0;
-}
+	sfc = calloc(1, sizeof(struct vlSurface));
 
-static int vlGrabNoBlock(short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-	
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		memset
-		(
-			dst + y * dst_pitch,
-			0,
-			VL_BLOCK_WIDTH * 2
-		);
-	
-	return 0;
-}
+	if (!sfc)
+		return 1;
 
-static int vlGrabBlocks
-(
-	struct VL_CONTEXT *context,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	enum VL_SAMPLE_TYPE sample_type,
-	short *blocks
-)
-{
-	struct pipe_surface	*tex_surface;
-	short			*texels;
-	unsigned int		tex_pitch;
-	unsigned int		tb, sb = 0;
-	
-	assert(context);
-	assert(blocks);
-	
-	tex_surface = context->pipe->screen->get_tex_surface
-	(
-		context->pipe->screen,
-		context->states.mc.textures[0],
-		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
-	);
-	
-	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
-	tex_pitch = tex_surface->stride / tex_surface->block.size;
-	
-	for (tb = 0; tb < 4; ++tb)
-	{
-		if ((coded_block_pattern >> (5 - tb)) & 1)
-		{
-			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
-			
-#ifdef DO_IDCT
-			vlTransformBlock(cur_block, cur_block, sample_type == VL_FULL_SAMPLE ? 128 : 0);
-#endif
-			
-			if (dct_type == VL_DCT_FRAME_CODED)
-				vlGrabFrameCodedBlock
-				(
-					cur_block,
-					texels + tb * tex_pitch * VL_BLOCK_HEIGHT,
-					tex_pitch
-				);
-			else
-				vlGrabFieldCodedBlock
-				(
-					cur_block,
-					texels + (tb % 2) * tex_pitch * VL_BLOCK_HEIGHT + (tb / 2) * tex_pitch,
-					tex_pitch
-				);
-			
-			++sb;
-		}
-		else
-			vlGrabNoBlock(texels + tb * tex_pitch * VL_BLOCK_HEIGHT, tex_pitch);
-	}
-	
-	pipe_surface_unmap(tex_surface);
-	
-	/* TODO: Implement 422, 444 */
-	for (tb = 0; tb < 2; ++tb)
-	{
-		tex_surface = context->pipe->screen->get_tex_surface
-			(
-				context->pipe->screen,
-				context->states.mc.textures[tb + 1],
-				0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE
-			);
-	
-		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE);
-		tex_pitch = tex_surface->stride / tex_surface->block.size;
-		
-		if ((coded_block_pattern >> (1 - tb)) & 1)
-		{
-			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
-			
-#ifdef DO_IDCT
-			vlTransformBlock(cur_block, cur_block, sample_type == VL_FULL_SAMPLE ? 128 : 0);
-#endif
-			
-			vlGrabFrameCodedBlock
-			(
-				cur_block,
-				texels,
-				tex_pitch
-			);
-			
-			++sb;
-		}
-		else
-			vlGrabNoBlock(texels, tex_pitch);
-		
-		pipe_surface_unmap(tex_surface);
-	}
-	
-	return 0;
-}
+	sfc->screen = screen;
+	sfc->width = width;
+	sfc->height = height;
+	sfc->format = format;
 
-int vlCreateSurface(struct VL_CONTEXT *context, struct VL_SURFACE **surface)
-{
-	struct pipe_context	*pipe;
-	struct pipe_texture	template;
-	struct VL_SURFACE	*sfc;
-	
-	assert(context);
-	assert(surface);
-	
-	pipe = context->pipe;
-	
-	sfc = calloc(1, sizeof(struct VL_SURFACE));
-	
-	sfc->context = context;
-	sfc->width = vlRoundUpPOT(context->video_width);
-	sfc->height = vlRoundUpPOT(context->video_height);
-	sfc->format = context->video_format;
-	
 	memset(&template, 0, sizeof(struct pipe_texture));
 	template.target = PIPE_TEXTURE_2D;
 	template.format = PIPE_FORMAT_A8R8G8B8_UNORM;
 	template.last_level = 0;
-	template.width[0] = sfc->width;
-	template.height[0] = sfc->height;
+	template.width[0] = vlRoundUpPOT(sfc->width);
+	template.height[0] = vlRoundUpPOT(sfc->height);
 	template.depth[0] = 1;
 	template.compressed = 0;
 	pf_get_block(template.format, &template.block);
-	/* XXX: Needed? */
 	template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
-	
-	sfc->texture = pipe->screen->texture_create(pipe->screen, &template);
-	
+
+	sfc->texture = vlGetPipeScreen(screen)->texture_create(vlGetPipeScreen(screen), &template);
+
 	*surface = sfc;
-	
+
 	return 0;
 }
 
-int vlDestroySurface(struct VL_SURFACE *surface)
+int vlDestroySurface
+(
+	struct vlSurface *surface
+)
 {
 	assert(surface);
+
 	pipe_texture_release(&surface->texture);
 	free(surface);
-	
+
 	return 0;
 }
 
-int vlRenderIMacroBlock
+int vlRenderMacroBlocksMpeg2
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *surface
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
 )
 {
-	struct pipe_context	*pipe;
-	struct VL_MC_VS_CONSTS	*vs_consts;
-	
-	assert(blocks);
+	assert(batch);
 	assert(surface);
-	
-	/* TODO: Implement interlaced rendering */
-	if (picture_type != VL_FRAME_PICTURE)
-		return 0;
-	
-	vlGrabBlocks(surface->context, coded_block_pattern, dct_type, VL_FULL_SAMPLE, blocks);
-	
-	pipe = surface->context->pipe;
-	
-	vs_consts = pipe->winsys->buffer_map
-	(
-		pipe->winsys,
-		surface->context->states.mc.vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE
-	);
-	
-	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->width;
-	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->height;
-	vs_consts->scale.z = 1.0f;
-	vs_consts->scale.w = 1.0f;
-	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->width;
-	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->height;
-	vs_consts->mb_pos_trans.z = 0.0f;
-	vs_consts->mb_pos_trans.w = 0.0f;
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, surface->context->states.mc.vs_const_buf.buffer);
-	
-	surface->context->states.mc.render_target.cbufs[0] = pipe->screen->get_tex_surface
+
+	surface->context->render->vlBegin(surface->context->render);
+
+	surface->context->render->vlRenderMacroBlocksMpeg2
 	(
-		pipe->screen,
-		surface->texture,
-		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+		surface->context->render,
+		batch,
+		surface
 	);
-	pipe->set_framebuffer_state(pipe, &surface->context->states.mc.render_target);
-	pipe->set_sampler_textures(pipe, 3, surface->context->states.mc.textures);
-	pipe->bind_sampler_states(pipe, 3, (void**)surface->context->states.mc.samplers);
-	pipe->bind_vs_state(pipe, surface->context->states.mc.i_vs);
-	pipe->bind_fs_state(pipe, surface->context->states.mc.i_fs);
-	
-	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
-	
+
+	surface->context->render->vlEnd(surface->context->render);
+
 	return 0;
 }
 
-int vlRenderPMacroBlock
+int vlPutPicture
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	enum VL_MC_TYPE mc_type,
-	struct VL_MOTION_VECTOR *motion_vector,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *ref_surface,
-	struct VL_SURFACE *surface
+	struct vlSurface *surface,
+	vlNativeDrawable drawable,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,
+	int desty,
+	int destw,
+	int desth,
+	enum vlPictureType picture_type
 )
 {
+	struct vlCSC		*csc;
 	struct pipe_context	*pipe;
-	struct VL_MC_VS_CONSTS	*vs_consts;
-	
-	assert(motion_vectors);
-	assert(blocks);
-	assert(ref_surface);
+
 	assert(surface);
-	
-	/* TODO: Implement interlaced rendering */
-	if (picture_type != VL_FRAME_PICTURE)
-		return 0;
-	/* TODO: Implement other MC types */
-	if (mc_type != VL_FRAME_MC && mc_type != VL_FIELD_MC)
-		return 0;
-	
-	vlGrabBlocks(surface->context, coded_block_pattern, dct_type, VL_DIFFERENCE_SAMPLE, blocks);
-	
+	assert(surface->context);
+
+	csc = surface->context->csc;
 	pipe = surface->context->pipe;
-	
-	vs_consts = pipe->winsys->buffer_map
+
+	csc->vlResizeFrameBuffer(csc, destw, desth);
+
+	csc->vlBegin(csc);
+
+	csc->vlPutPicture
 	(
-		pipe->winsys,
-		surface->context->states.mc.vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE
+		csc,
+		surface,
+		srcx,
+		srcy,
+		srcw,
+		srch,
+		destx,
+		desty,
+		destw,
+		desth,
+		picture_type
 	);
-	
-	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->width;
-	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->height;
-	vs_consts->scale.z = 1.0f;
-	vs_consts->scale.w = 1.0f;
-	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->width;
-	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->height;
-	vs_consts->mb_pos_trans.z = 0.0f;
-	vs_consts->mb_pos_trans.w = 0.0f;
-	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector->top_field.x * 0.5f) / (float)surface->width;
-	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector->top_field.y * 0.5f) / (float)surface->height;
-	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
-	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;
-	
-	if (mc_type == VL_FIELD_MC)
-	{
-		vs_consts->denorm.x = (float)surface->width;
-		vs_consts->denorm.y = (float)surface->height;
-		
-		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector->bottom_field.x * 0.5f) / (float)surface->width;
-		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector->bottom_field.y * 0.5f) / (float)surface->height;
-		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
-		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
-		
-		pipe->bind_vs_state(pipe, surface->context->states.mc.p_vs[1]);
-		pipe->bind_fs_state(pipe, surface->context->states.mc.p_fs[1]);
-	}
-	else
-	{
-		pipe->bind_vs_state(pipe, surface->context->states.mc.p_vs[0]);
-		pipe->bind_fs_state(pipe, surface->context->states.mc.p_fs[0]);
-	}
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, surface->context->states.mc.vs_const_buf.buffer);
-	
-	surface->context->states.mc.render_target.cbufs[0] = pipe->screen->get_tex_surface
+
+	csc->vlEnd(csc);
+
+	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+	bind_pipe_drawable(pipe, drawable);
+	/* TODO: Need to take destx, desty into consideration */
+	pipe->winsys->flush_frontbuffer
 	(
-		pipe->screen,
-		surface->texture,
-		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+		pipe->winsys,
+		csc->vlGetFrameBuffer(csc),
+		pipe->priv
 	);
-	pipe->set_framebuffer_state(pipe, &surface->context->states.mc.render_target);
-	
-	surface->context->states.mc.textures[3] = ref_surface->texture;
-	pipe->set_sampler_textures(pipe, 4, surface->context->states.mc.textures);
-	pipe->bind_sampler_states(pipe, 4, (void**)surface->context->states.mc.samplers);
-	
-	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
-	
+
 	return 0;
 }
 
-int vlRenderBMacroBlock
+struct vlScreen* vlSurfaceGetScreen
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	enum VL_MC_TYPE mc_type,
-	struct VL_MOTION_VECTOR *motion_vector,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *past_surface,
-	struct VL_SURFACE *future_surface,
-	struct VL_SURFACE *surface
+	struct vlSurface *surface
 )
 {
-	struct pipe_context	*pipe;
-	struct VL_MC_VS_CONSTS	*vs_consts;
-	
-	assert(motion_vectors);
-	assert(blocks);
-	assert(ref_surface);
 	assert(surface);
-	
-	/* TODO: Implement interlaced rendering */
-	if (picture_type != VL_FRAME_PICTURE)
-		return 0;
-	/* TODO: Implement other MC types */
-	if (mc_type != VL_FRAME_MC && mc_type != VL_FIELD_MC)
-		return 0;
-	
-	vlGrabBlocks(surface->context, coded_block_pattern, dct_type, VL_DIFFERENCE_SAMPLE, blocks);
-	
-	pipe = surface->context->pipe;
-	
-	vs_consts = pipe->winsys->buffer_map
-	(
-		pipe->winsys,
-		surface->context->states.mc.vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE
-	);
-	
-	vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->width;
-	vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->height;
-	vs_consts->scale.z = 1.0f;
-	vs_consts->scale.w = 1.0f;
-	vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->width;
-	vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->height;
-	vs_consts->mb_pos_trans.z = 0.0f;
-	vs_consts->mb_pos_trans.w = 0.0f;
-	vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector[0].top_field.x * 0.5f) / (float)surface->width;
-	vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector[0].top_field.y * 0.5f) / (float)surface->height;
-	vs_consts->mb_tc_trans[0].top_field.z = 0.0f;
-	vs_consts->mb_tc_trans[0].top_field.w = 0.0f;
-	vs_consts->mb_tc_trans[1].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector[1].top_field.x * 0.5f) / (float)surface->width;
-	vs_consts->mb_tc_trans[1].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector[1].top_field.y * 0.5f) / (float)surface->height;
-	vs_consts->mb_tc_trans[1].top_field.z = 0.0f;
-	vs_consts->mb_tc_trans[1].top_field.w = 0.0f;
-	
-	if (mc_type == VL_FIELD_MC)
-	{
-		vs_consts->denorm.x = (float)surface->width;
-		vs_consts->denorm.y = (float)surface->height;
-		
-		vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector[0].bottom_field.x * 0.5f) / (float)surface->width;
-		vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector[0].bottom_field.y * 0.5f) / (float)surface->height;
-		vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f;
-		vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f;
-		vs_consts->mb_tc_trans[1].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + motion_vector[1].bottom_field.x * 0.5f) / (float)surface->width;
-		vs_consts->mb_tc_trans[1].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + motion_vector[1].bottom_field.y * 0.5f) / (float)surface->height;
-		vs_consts->mb_tc_trans[1].bottom_field.z = 0.0f;
-		vs_consts->mb_tc_trans[1].bottom_field.w = 0.0f;
-		
-		pipe->bind_vs_state(pipe, surface->context->states.mc.b_vs[1]);
-		pipe->bind_fs_state(pipe, surface->context->states.mc.b_fs[1]);
-	}
-	else
-	{
-		pipe->bind_vs_state(pipe, surface->context->states.mc.b_vs[0]);
-		pipe->bind_fs_state(pipe, surface->context->states.mc.b_fs[0]);
-	}
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, surface->context->states.mc.vs_const_buf.buffer);
-	
-	surface->context->states.mc.render_target.cbufs[0] = pipe->screen->get_tex_surface
-	(
-		pipe->screen,
-		surface->texture,
-		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
-	);
-	pipe->set_framebuffer_state(pipe, &surface->context->states.mc.render_target);
-	
-	surface->context->states.mc.textures[3] = past_surface->texture;
-	surface->context->states.mc.textures[4] = future_surface->texture;
-	pipe->set_sampler_textures(pipe, 5, surface->context->states.mc.textures);
-	pipe->bind_sampler_states(pipe, 5, (void**)surface->context->states.mc.samplers);
-	
-	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24);
-	
-	return 0;
+
+	return surface->screen;
 }
 
-int vlPutSurface
+struct vlContext* vlBindToContext
 (
-	struct VL_SURFACE *surface,
-	Drawable drawable,
-	unsigned int srcx,
-	unsigned int srcy,
-	unsigned int srcw,
-	unsigned int srch,
-	unsigned int destx,
-	unsigned int desty,
-	unsigned int destw,
-	unsigned int desth,
-	enum VL_PICTURE picture_type
+	struct vlSurface *surface,
+	struct vlContext *context
 )
 {
-	unsigned int		create_fb = 0;
-	struct pipe_context	*pipe;
-	struct VL_CSC_VS_CONSTS	*vs_consts;
-	
+	struct vlContext *old;
+
 	assert(surface);
-	
-	pipe = surface->context->pipe;
-	
-	if (!surface->context->states.csc.framebuffer.cbufs[0])
-		create_fb = 1;
-	else if
-	(
-		surface->context->states.csc.framebuffer.width != destw ||
-		surface->context->states.csc.framebuffer.height != desth
-	)
-	{
-		pipe->winsys->surface_release
-		(
-			pipe->winsys,
-			&surface->context->states.csc.framebuffer.cbufs[0]
-		);
-		
-		create_fb = 1;
-	}
-	
-	if (create_fb)
-	{
-		surface->context->states.csc.viewport.scale[0] = destw;
-		surface->context->states.csc.viewport.scale[1] = desth;
-		surface->context->states.csc.viewport.scale[2] = 1;
-		surface->context->states.csc.viewport.scale[3] = 1;
-		surface->context->states.csc.viewport.translate[0] = 0;
-		surface->context->states.csc.viewport.translate[1] = 0;
-		surface->context->states.csc.viewport.translate[2] = 0;
-		surface->context->states.csc.viewport.translate[3] = 0;
-		
-		surface->context->states.csc.framebuffer.width = destw;
-		surface->context->states.csc.framebuffer.height = desth;
-		surface->context->states.csc.framebuffer.cbufs[0] = pipe->winsys->surface_alloc(pipe->winsys);
-		pipe->winsys->surface_alloc_storage
-		(
-			pipe->winsys,
-			surface->context->states.csc.framebuffer.cbufs[0],
-			destw,
-			desth,
-			PIPE_FORMAT_A8R8G8B8_UNORM,
-			/* XXX: SoftPipe doesn't change GPU usage to CPU like it does for textures */
-			PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE,
-			0
-		);
-	}
-	
-	vlEndRender(surface->context);
-	
-	vs_consts = pipe->winsys->buffer_map
-	(
-		pipe->winsys,
-		surface->context->states.csc.vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE
-	);
-	
-	vs_consts->src_scale.x = srcw / (float)surface->width;
-	vs_consts->src_scale.y = srch / (float)surface->height;
-	vs_consts->src_scale.z = 1;
-	vs_consts->src_scale.w = 1;
-	vs_consts->src_trans.x = srcx / (float)surface->width;
-	vs_consts->src_trans.y = srcy / (float)surface->height;
-	vs_consts->src_trans.z = 0;
-	vs_consts->src_trans.w = 0;
-	
-	pipe->winsys->buffer_unmap(pipe->winsys, surface->context->states.csc.vs_const_buf.buffer);
-	
-	pipe->set_sampler_textures(pipe, 1, &surface->texture);
-	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
-	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
-	bind_pipe_drawable(pipe, drawable);
-	/* TODO: Need to take destx, desty into consideration */
-	pipe->winsys->flush_frontbuffer
-	(
-		pipe->winsys,
-		surface->context->states.csc.framebuffer.cbufs[0],
-		pipe->priv
-	);
-	
-	vlBeginRender(surface->context);
-	
-	return 0;
-}
 
+	old = surface->context;
+	surface->context = context;
+
+	return old;
+}
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.h b/src/gallium/state_trackers/g3dvl/vl_surface.h
index 9f56b77e1e..b975e131fa 100644
--- a/src/gallium/state_trackers/g3dvl/vl_surface.h
+++ b/src/gallium/state_trackers/g3dvl/vl_surface.h
@@ -1,81 +1,66 @@
 #ifndef vl_surface_h
 #define vl_surface_h
 
-#include <X11/Xlib.h>
 #include "vl_types.h"
 
+#ifdef VL_INTERNAL
 struct pipe_texture;
 
-struct VL_SURFACE
+struct vlSurface
 {
-	struct VL_CONTEXT	*context;
+	struct vlScreen		*screen;
+	struct vlContext	*context;
 	unsigned int		width;
 	unsigned int		height;
-	enum VL_FORMAT		format;
+	enum vlFormat		format;
 	struct pipe_texture	*texture;
 };
+#endif
 
-int vlCreateSurface(struct VL_CONTEXT *context, struct VL_SURFACE **surface);
+int vlCreateSurface
+(
+	struct vlScreen *screen,
+	unsigned int width,
+	unsigned int height,
+	enum vlFormat format,
+	struct vlSurface **surface
+);
 
-int vlDestroySurface(struct VL_SURFACE *surface);
+int vlDestroySurface
+(
+	struct vlSurface *surface
+);
 
-int vlRenderIMacroBlock
+int vlRenderMacroBlocksMpeg2
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *surface
+	struct vlMpeg2MacroBlockBatch *batch,
+	struct vlSurface *surface
 );
 
-int vlRenderPMacroBlock
+int vlPutPicture
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	enum VL_MC_TYPE mc_type,
-	struct VL_MOTION_VECTOR *motion_vector,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *ref_surface,
-	struct VL_SURFACE *surface
+	struct vlSurface *surface,
+	vlNativeDrawable drawable,
+	int srcx,
+	int srcy,
+	int srcw,
+	int srch,
+	int destx,
+	int desty,
+	int destw,
+	int desth,
+	enum vlPictureType picture_type
 );
 
-int vlRenderBMacroBlock
+struct vlScreen* vlSurfaceGetScreen
 (
-	enum VL_PICTURE picture_type,
-	enum VL_FIELD_ORDER field_order,
-	unsigned int mbx,
-	unsigned int mby,
-	enum VL_MC_TYPE mc_type,
-	struct VL_MOTION_VECTOR *motion_vector,
-	unsigned int coded_block_pattern,
-	enum VL_DCT_TYPE dct_type,
-	short *blocks,
-	struct VL_SURFACE *past_surface,
-	struct VL_SURFACE *future_surface,
-	struct VL_SURFACE *surface
+	struct vlSurface *surface
 );
 
-int vlPutSurface
+struct vlContext* vlBindToContext
 (
-	struct VL_SURFACE *surface,
-	Drawable drawable,
-	unsigned int srcx,
-	unsigned int srcy,
-	unsigned int srcw,
-	unsigned int srch,
-	unsigned int destx,
-	unsigned int desty,
-	unsigned int destw,
-	unsigned int desth,
-	enum VL_PICTURE picture_type
+	struct vlSurface *surface,
+	struct vlContext *context
 );
 
 #endif
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_types.h b/src/gallium/state_trackers/g3dvl/vl_types.h
index 4d210c9e0a..504ba8ac81 100644
--- a/src/gallium/state_trackers/g3dvl/vl_types.h
+++ b/src/gallium/state_trackers/g3dvl/vl_types.h
@@ -1,102 +1,106 @@
 #ifndef vl_types_h
 #define vl_types_h
 
-enum VL_FORMAT
-{
-	VL_FORMAT_YCBCR_420,
-	VL_FORMAT_YCBCR_422,
-	VL_FORMAT_YCBCR_444
-};
+#if 1 /* FIXME: X11 is unconditionally assumed; the intended guard appears to be "#ifdef X11" */
+#include <X11/Xlib.h>
 
-enum VL_PICTURE
-{
-	VL_TOP_FIELD,
-	VL_BOTTOM_FIELD,
-	VL_FRAME_PICTURE
-};
+typedef Display* vlNativeDisplay;	/* native display handle: pointer to an Xlib Display */
+typedef Drawable vlNativeDrawable;	/* native drawable handle: an Xlib Drawable */
+#endif
+
+struct vlDisplay;	/* these four structs are forward-declared only; none of them is defined in this header */
+struct vlScreen;
+struct vlContext;
+struct vlSurface;
 
-enum VL_FIELD_ORDER
+enum vlProfile	/* profile identifiers; by name, MPEG-2 Simple and Main */
 {
-	VL_FIELD_FIRST,
-	VL_FIELD_SECOND
+	vlProfileMpeg2Simple,
+	vlProfileMpeg2Main,
+
+	vlProfileCount	/* sentinel: evaluates to the number of profiles listed above (2); must stay last */
 };
 
-enum VL_DCT_TYPE
+enum vlEntryPoint	/* entry point identifiers; presumably IDCT, motion compensation and color space conversion - TODO confirm against users */
 {
-	VL_DCT_FIELD_CODED,
-	VL_DCT_FRAME_CODED
+	vlEntryPointIDCT,
+	vlEntryPointMC,
+	vlEntryPointCSC,
+
+	vlEntryPointCount	/* sentinel: evaluates to the number of entry points listed above (3); must stay last */
 };
 
-enum VL_SAMPLE_TYPE
+enum vlFormat	/* renamed from VL_FORMAT; enumerator order, and therefore numeric values, unchanged */
 {
-	VL_FULL_SAMPLE,
-	VL_DIFFERENCE_SAMPLE
+	vlFormatYCbCr420,	/* suffix presumably names the chroma subsampling (4:2:0) - TODO confirm */
+	vlFormatYCbCr422,
+	vlFormatYCbCr444
 };
 
-enum VL_MC_TYPE
+enum vlPictureType	/* renamed from VL_PICTURE; enumerator order, and therefore numeric values, unchanged */
 {
-	VL_FIELD_MC,
-	VL_FRAME_MC,
-	VL_DUAL_PRIME_MC,
-	VL_16x8_MC = VL_FRAME_MC
+	vlPictureTypeTopField,
+	vlPictureTypeBottomField,
+	vlPictureTypeFrame
 };
 
-struct VL_VERTEX4F
+enum vlMotionType	/* replaces VL_MC_TYPE; the first three enumerators keep their order and values */
 {
-	float x, y, z, w;
+	vlMotionTypeField,
+	vlMotionTypeFrame,
+	vlMotionTypeDualPrime,
+	vlMotionType16x8	/* NOTE(review): now a distinct value (3); the removed VL_16x8_MC was an alias of VL_FRAME_MC */
 };
 
-struct VL_VERTEX2F
+enum vlFieldOrder	/* renamed from VL_FIELD_ORDER; enumerator order, and therefore numeric values, unchanged */
 {
-	float x, y;
+	vlFieldOrderFirst,
+	vlFieldOrderSecond
 };
 
-struct VL_TEXCOORD2F
+enum vlDCTType	/* NOTE(review): numeric order is swapped relative to the removed VL_DCT_TYPE, where field-coded was 0 and frame-coded was 1 */
 {
-	float s, t;
+	vlDCTTypeFrameCoded,	/* = 0 */
+	vlDCTTypeFieldCoded	/* = 1 */
 };
 
-struct VL_MC_VS_CONSTS
+struct vlVertex2f	/* two-component float vector; renamed from VL_VERTEX2F */
 {
-	struct VL_VERTEX4F	scale;
-	struct VL_VERTEX4F	mb_pos_trans;
-	struct VL_VERTEX4F	denorm;
-	struct
-	{
-		struct VL_VERTEX4F	top_field;
-		struct VL_VERTEX4F	bottom_field;
-	} mb_tc_trans[2];
+	float x, y;
 };
 
-struct VL_MC_FS_CONSTS
+struct vlVertex4f	/* four-component float vector; renamed from VL_VERTEX4F */
 {
-	struct VL_VERTEX4F	multiplier;
-	struct VL_VERTEX4F	bias;
-	struct VL_VERTEX4F	y_divider;
+	float x, y, z, w;
 };
 
-struct VL_CSC_VS_CONSTS
+enum vlMacroBlockType	/* macroblock kind; presumably takes over the distinction made by the removed vlRender{I,P,B}MacroBlock prototypes - TODO confirm */
 {
-	struct VL_VERTEX4F	src_scale;
-	struct VL_VERTEX4F	src_trans;
+	vlMacroBlockTypeIntra,
+	vlMacroBlockTypeFwdPredicted,	/* by name: predicted from a forward reference */
+	vlMacroBlockTypeBkwdPredicted,	/* by name: predicted from a backward reference */
+	vlMacroBlockTypeBiPredicted	/* by name: predicted from both references */
 };
 
-struct VL_CSC_FS_CONSTS
+struct vlMpeg2MacroBlock	/* per-macroblock parameters; one element of vlMpeg2MacroBlockBatch.macroblocks */
 {
-	struct VL_VERTEX4F	bias;
-	float			matrix[16];
+	unsigned int		mbx, mby;	/* macroblock position; units (macroblocks vs. pixels) are not stated here - TODO confirm */
+	enum vlMacroBlockType	mb_type;
+	enum vlMotionType	mo_type;
+	enum vlDCTType		dct_type;
+	int			PMV[2][2][2];	/* motion vectors; replaces struct VL_MOTION_VECTOR. Index meaning is not documented here - presumably MPEG-2 PMV[r][s][t]; TODO confirm */
+	unsigned int		cbp;	/* presumably the coded block pattern (the removed prototypes named this parameter coded_block_pattern) */
+	short			*blocks;	/* block data; layout and ownership are not visible in this header */
 };
 
-struct VL_MOTION_VECTOR
+struct vlMpeg2MacroBlockBatch	/* argument bundle passed to vlRenderMacroBlocksMpeg2 */
 {
-	struct
-	{
-		int x, y;
-	} top_field, bottom_field;
+	struct vlSurface		*past_surface;	/* presumably the forward-prediction reference; whether NULL is allowed for intra-only batches is not visible here */
+	struct vlSurface		*future_surface;	/* presumably the backward-prediction reference - TODO confirm */
+	enum vlPictureType		picture_type;
+	enum vlFieldOrder		field_order;
+	unsigned int			num_macroblocks;	/* presumably the element count of macroblocks[] - TODO confirm */
+	struct vlMpeg2MacroBlock	*macroblocks;
 };
 
-struct VL_CONTEXT;
-struct VL_SURFACE;
-
 #endif
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.c b/src/gallium/state_trackers/g3dvl/vl_util.c
index 2421ae2210..50aa9af66f 100644
--- a/src/gallium/state_trackers/g3dvl/vl_util.c
+++ b/src/gallium/state_trackers/g3dvl/vl_util.c
@@ -4,14 +4,13 @@
 unsigned int vlRoundUpPOT(unsigned int x)
 {
 	unsigned int i;
-	
+
 	assert(x > 0);
-	
+
 	--x;
-	
+
 	for (i = 1; i < sizeof(unsigned int) * 8; i <<= 1)
 		x |= x >> i;
-	
+
 	return x + 1;
 }
-
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.h b/src/gallium/state_trackers/g3dvl/vl_util.h
index e4b72c4f87..bc98e79df4 100644
--- a/src/gallium/state_trackers/g3dvl/vl_util.h
+++ b/src/gallium/state_trackers/g3dvl/vl_util.h
@@ -4,4 +4,3 @@
 unsigned int vlRoundUpPOT(unsigned int x);
 
 #endif
-
-- 
cgit v1.2.3