From ebc4a9bf2eff7d2c0d89785e865a1df23733e64b Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 16 Jul 2009 07:50:34 +0100 Subject: tgsi: reduce x86 reg usage in tgsi_sse generated programs Pass the tgsi_exec_machine struct in directly and just hold a single pointer to this struct, rather than keeping one for each of its internal members. --- src/gallium/auxiliary/draw/draw_vs_sse.c | 24 +--- src/gallium/auxiliary/tgsi/tgsi_sse2.c | 190 +++++++++++++------------------ src/gallium/auxiliary/tgsi/tgsi_sse2.h | 28 +++++ src/gallium/drivers/softpipe/sp_fs_sse.c | 23 +--- 4 files changed, 115 insertions(+), 150 deletions(-) (limited to 'src') diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index a4f72c40ef..fb58983e01 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -52,24 +52,12 @@ #define SSE_MAX_VERTICES 4 -typedef void (PIPE_CDECL *codegen_function) ( - const struct tgsi_exec_vector *input, /* 1 */ - struct tgsi_exec_vector *output, /* 2 */ - float (*constant)[4], /* 3 */ - struct tgsi_exec_vector *temporary, /* 4 */ - float (*immediates)[4], /* 5 */ - const float (*aos_input)[4], /* 6 */ - uint num_inputs, /* 7 */ - uint input_stride, /* 8 */ - float (*aos_output)[4], /* 9 */ - uint num_outputs, /* 10 */ - uint output_stride ); /* 11 */ struct draw_sse_vertex_shader { struct draw_vertex_shader base; struct x86_function sse2_program; - codegen_function func; + tgsi_sse2_vs_func func; struct tgsi_exec_machine *machine; }; @@ -119,11 +107,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base, /* run compiled shader */ - shader->func(machine->Inputs, - machine->Outputs, - (float (*)[4])constants, - machine->Temps, - (float (*)[4])shader->base.immediates, + shader->func(machine, + constants, + shader->base.immediates, input, base->info.num_inputs, input_stride, @@ -195,7 +181,7 @@ draw_create_vs_sse(struct draw_context *draw, TRUE )) goto fail; - vs->func = (codegen_function) x86_get_func( &vs->sse2_program ); + vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program ); if (!vs->func) { goto fail; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 5084befc4e..cfe8ef0ecf 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -32,6 +32,7 @@ #include "util/u_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/u_memory.h" #if defined(PIPE_ARCH_SSE) #include "util/u_sse.h" #endif @@ -104,33 +105,43 @@ get_const_base( void ) } static struct x86_reg -get_input_base( void ) +get_machine_base( void ) { return x86_make_reg( file_REG32, reg_AX ); } +static struct x86_reg +get_input_base( void ) +{ + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Inputs) ); +} + static struct x86_reg get_output_base( void ) { - return x86_make_reg( - file_REG32, - reg_DX ); + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Outputs) ); } static struct x86_reg get_temp_base( void ) { - return x86_make_reg( - file_REG32, - reg_BX ); + return x86_make_disp( + get_machine_base(), + Offset(struct tgsi_exec_machine, Temps) ); } static struct x86_reg get_coef_base( void ) { - return get_output_base(); + return x86_make_reg( + file_REG32, + reg_BX ); } static struct x86_reg @@ -138,7 +149,7 @@ get_immediate_base( void ) { return x86_make_reg( file_REG32, - reg_DI ); + reg_DX ); } @@ -2551,7 +2562,7 @@ emit_declaration( static void aos_to_soa( struct x86_function *func, uint arg_aos, - uint arg_soa, + uint arg_machine, uint arg_num, uint arg_stride ) { @@ -2566,7 +2577,10 @@ static void aos_to_soa( struct x86_function *func, x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) ); + x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) ); + x86_lea( func, soa_input, + x86_make_disp( soa_input, + Offset(struct tgsi_exec_machine, Inputs) ) ); x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); @@ -2608,28 +2622,30 @@ static void aos_to_soa( struct x86_function *func, x86_jcc( func, cc_NE, inner_loop ); /* Restore EBX */ - x86_pop( func, aos_input ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); } -static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride ) +static void soa_to_aos( struct x86_function *func, + uint arg_aos, + uint arg_machine, + uint arg_num, + uint arg_stride ) { - struct x86_reg soa_output; - struct x86_reg aos_output; - struct x86_reg num_outputs; - struct x86_reg temp; + struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); + struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); + struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg temp = x86_make_reg( file_REG32, reg_DX ); int inner_loop; - soa_output = x86_make_reg( file_REG32, reg_AX ); - aos_output = x86_make_reg( file_REG32, reg_BX ); - num_outputs = x86_make_reg( file_REG32, reg_CX ); - temp = x86_make_reg( file_REG32, reg_DX ); - /* Save EBX */ - x86_push( func, aos_output ); + x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - x86_mov( func, soa_output, x86_fn_arg( func, soa ) ); - x86_mov( func, aos_output, x86_fn_arg( func, aos ) ); - x86_mov( func, num_outputs, x86_fn_arg( func, num ) ); + x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); + x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); + x86_lea( func, soa_output, + x86_make_disp( soa_output, + Offset(struct tgsi_exec_machine, Outputs) ) ); + x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) ); /* do */ inner_loop = x86_get_label( func ); @@ -2646,7 +2662,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); - x86_mov( func, temp, x86_fn_arg( func, stride ) ); + x86_mov( func, temp, x86_fn_arg( func, arg_stride ) ); x86_push( func, aos_output ); sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); @@ -2670,20 +2686,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, x86_jcc( func, cc_NE, inner_loop ); /* Restore EBX */ - x86_pop( func, aos_output ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); } /** * Translate a TGSI vertex/fragment shader to SSE2 code. * Slightly different things are done for vertex vs. fragment shaders. * - * Note that fragment shaders are responsible for interpolating shader - * inputs. Because on x86 we have only 4 GP registers, and here we - * have 5 shader arguments (input, output, const, temp and coef), the - * code is split into two phases -- DECLARATION and INSTRUCTION phase. - * GP register holding the output argument is aliased with the coeff - * argument, as outputs are not needed in the DECLARATION phase. - * * \param tokens the TGSI input shader * \param func the output SSE code/function * \param immediates buffer to place immediates, later passed to SSE func @@ -2697,7 +2706,6 @@ tgsi_emit_sse2( boolean do_swizzles ) { struct tgsi_parse_context parse; - boolean instruction_phase = FALSE; unsigned ok = 1; uint num_immediates = 0; @@ -2709,74 +2717,42 @@ tgsi_emit_sse2( /* Can't just use EDI, EBX without save/restoring them: */ - x86_push( - func, - get_immediate_base() ); - - x86_push( - func, - get_temp_base() ); - + x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); + x86_push( func, x86_make_reg( file_REG32, reg_DI ) ); /* * Different function args for vertex/fragment shaders: */ - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - /* DECLARATION phase, do not load output argument. */ - x86_mov( - func, - get_input_base(), - x86_fn_arg( func, 1 ) ); - /* skipping outputs argument here */ - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 3 ) ); - x86_mov( - func, - get_temp_base(), - x86_fn_arg( func, 4 ) ); - x86_mov( - func, - get_coef_base(), - x86_fn_arg( func, 5 ) ); - x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 6 ) ); - } - else { - assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX); - + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { if (do_swizzles) aos_to_soa( func, - 6, /* aos_input */ - 1, /* machine->input */ - 7, /* num_inputs */ - 8 ); /* input_stride */ + 4, /* aos_input */ + 1, /* machine */ + 5, /* num_inputs */ + 6 ); /* input_stride */ + } + x86_mov( + func, + get_machine_base(), + x86_fn_arg( func, 1 ) ); + x86_mov( + func, + get_const_base(), + x86_fn_arg( func, 2 ) ); + x86_mov( + func, + get_immediate_base(), + x86_fn_arg( func, 3 ) ); + + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { x86_mov( - func, - get_input_base(), - x86_fn_arg( func, 1 ) ); - x86_mov( - func, - get_output_base(), - x86_fn_arg( func, 2 ) ); - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 3 ) ); - x86_mov( - func, - get_temp_base(), - x86_fn_arg( func, 4 ) ); - x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 5 ) ); + func, + get_coef_base(), + x86_fn_arg( func, 4 ) ); } + while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { tgsi_parse_token( &parse ); @@ -2790,17 +2766,6 @@ tgsi_emit_sse2( break; case TGSI_TOKEN_TYPE_INSTRUCTION: - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - if( !instruction_phase ) { - /* INSTRUCTION phase, overwrite coeff with output. */ - instruction_phase = TRUE; - x86_mov( - func, - get_output_base(), - x86_fn_arg( func, 2 ) ); - } - } - ok = emit_instruction( func, &parse.FullToken.FullInstruction ); @@ -2844,18 +2809,17 @@ tgsi_emit_sse2( if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { if (do_swizzles) - soa_to_aos( func, 9, 2, 10, 11 ); + soa_to_aos( func, + 7, /* aos_output */ + 1, /* machine */ + 8, /* num_outputs */ + 9 ); /* output_stride */ } /* Can't just use EBX, EDI without save/restoring them: */ - x86_pop( - func, - get_temp_base() ); - - x86_pop( - func, - get_immediate_base() ); + x86_pop( func, x86_make_reg( file_REG32, reg_DI ) ); + x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); emit_ret( func ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h index af838b2a25..d81ee3d00e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -34,6 +34,7 @@ extern "C" { struct tgsi_token; struct x86_function; +struct tgsi_interp_coef; unsigned tgsi_emit_sse2( @@ -42,6 +43,33 @@ tgsi_emit_sse2( float (*immediates)[4], boolean do_swizzles ); + +/* This is the function prototype generated when do_swizzles is false + * -- effectively for fragment shaders. + */ +typedef void (PIPE_CDECL *tgsi_sse2_fs_function) ( + struct tgsi_exec_machine *machine, /* 1 */ + const float (*constant)[4], /* 2 */ + const float (*immediate)[4], /* 3 */ + const struct tgsi_interp_coef *coef /* 4 */ + ); + + +/* This is the function prototype generated when do_swizzles is true + * -- effectively for vertex shaders. + */ +typedef void (PIPE_CDECL *tgsi_sse2_vs_func) ( + struct tgsi_exec_machine *machine, /* 1 */ + const float (*constant)[4], /* 2 */ + const float (*immediate)[4], /* 3 */ + const float (*aos_input)[4], /* 4 */ + uint num_inputs, /* 5 */ + uint input_stride, /* 6 */ + float (*aos_output)[4], /* 7 */ + uint num_outputs, /* 8 */ + uint output_stride ); /* 9 */ + + #if defined __cplusplus } #endif diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c index 31c3ca21c5..f9362efcb7 100644 --- a/src/gallium/drivers/softpipe/sp_fs_sse.c +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -45,17 +45,6 @@ #include "rtasm/rtasm_x86sse.h" -/* Surely this should be defined somewhere in a tgsi header: - */ -typedef void (PIPE_CDECL *codegen_function)( - const struct tgsi_exec_vector *input, - struct tgsi_exec_vector *output, - const float (*constant)[4], - struct tgsi_exec_vector *temporary, - const struct tgsi_interp_coef *coef, - float (*immediates)[4] - //, const struct tgsi_exec_vector *quadPos - ); /** @@ -65,7 +54,7 @@ struct sp_sse_fragment_shader { struct sp_fragment_shader base; struct x86_function sse2_program; - codegen_function func; + tgsi_sse2_fs_function func; float immediates[TGSI_EXEC_NUM_IMMEDIATES][4]; }; @@ -107,12 +96,10 @@ fs_sse_run( const struct sp_fragment_shader *base, tgsi_set_kill_mask(machine, 0x0); tgsi_set_exec_mask(machine, 1, 1, 1, 1); - shader->func( machine->Inputs, - machine->Outputs, + shader->func( machine, machine->Consts, - machine->Temps, - machine->InterpCoefs, - shader->immediates + (const float (*)[4])shader->immediates, + machine->InterpCoefs // , &machine->QuadPos ); @@ -151,7 +138,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe, return NULL; } - shader->func = (codegen_function) x86_get_func( &shader->sse2_program ); + shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program ); if (!shader->func) { x86_release_func( &shader->sse2_program ); FREE(shader); -- cgit v1.2.3