/************************************************************************** * * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ #include #include "pipe/p_compiler.h" #include "spu_main.h" #include "spu_texture.h" #include "spu_tile.h" #include "spu_colorpack.h" #include "spu_dcache.h" /** * Mark all tex cache entries as invalid. */ void invalidate_tex_cache(void) { uint unit = 0; uint bytes = 4 * spu.texture[unit].width * spu.texture[unit].height; spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes); } /** * XXX look into getting texels for all four pixels in a quad at once. */ static uint get_texel(uint unit, vec_uint4 coordinate) { /* * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as * SIMD since X and Y are already in a SIMD register. 
*/ const unsigned texture_ea = (uintptr_t) spu.texture[unit].start; ushort x = spu_extract(coordinate, 0); ushort y = spu_extract(coordinate, 1); unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE)); ushort texel_offset = (ushort) 4 * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE)); vec_uint4 tmp; spu_dcache_fetch_unaligned((qword *) & tmp, texture_ea + tile_offset + texel_offset, 4); return spu_extract(tmp, 0); } /** * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ... * * NOTE: in the typical case of bilinear filtering, the four texels * are in a 2x2 group so we could get by with just two dcache fetches * (two side-by-side texels per fetch). But when bilinear filtering * wraps around a texture edge, we'll probably need code like we have * now. * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time, * it's quite likely that the four pixels in a quad will need some of the * same texels. So look into doing texture fetches for four pixels at * a time. 
*/
static void
get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
{
   const unsigned base_ea = (uintptr_t) spu.texture[unit].start;

   /* tile holding each texel: coordinate / 32 */
   const vec_uint4 tile_col = spu_rlmask(x, -5);
   const vec_uint4 tile_row = spu_rlmask(y, -5);

   /* position inside the tile: coordinate & 0x1f */
   const qword col_in_tile = si_andi((qword) x, 0x1f);
   const qword row_in_tile = si_andi((qword) y, 0x1f);

   const qword row_pitch =
      (qword) spu_splats(spu.texture[unit].tiles_per_row);
   const qword bytes_per_tile =
      (qword) spu_splats((unsigned) sizeof(tile_t));

   /* (tile_row * tiles_per_row + tile_col) * sizeof(tile_t)
    * NOTE(review): si_mpya/si_mpy/si_mpyui are presumably the SPU's 16-bit
    * multiply forms, so the operands are expected to fit in 16 bits --
    * confirm against the SPU ISA.
    */
   const qword tile_index =
      si_mpya((qword) tile_row, row_pitch, (qword) tile_col);
   const qword tile_bytes = si_mpy((qword) tile_index, bytes_per_tile);

   /* (row_in_tile * 32 + col_in_tile) * 4 */
   const qword texel_index = si_a(si_mpyui(row_in_tile, 32), col_in_tile);
   const qword texel_bytes = si_mpyui(texel_index, 4);

   /* one byte offset per requested texel */
   const vec_uint4 offset = (vec_uint4) si_a(tile_bytes, texel_bytes);

   /* four independent four-byte fetches, one into each output vector */
   spu_dcache_fetch_unaligned((qword *) &texels[0],
                              base_ea + spu_extract(offset, 0), 4);
   spu_dcache_fetch_unaligned((qword *) &texels[1],
                              base_ea + spu_extract(offset, 1), 4);
   spu_dcache_fetch_unaligned((qword *) &texels[2],
                              base_ea + spu_extract(offset, 2), 4);
   spu_dcache_fetch_unaligned((qword *) &texels[3],
                              base_ea + spu_extract(offset, 3), 4);
}


/**
 * Do nearest texture sampling for four pixels.
 * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
*/ void sample_texture4_nearest(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { vector float ss = spu_mul(s, spu.texture[unit].width4); vector float tt = spu_mul(t, spu.texture[unit].height4); vector unsigned int is = spu_convtu(ss, 0); vector unsigned int it = spu_convtu(tt, 0); vec_uint4 texels[4]; /* PIPE_TEX_WRAP_REPEAT */ is = spu_and(is, spu.texture[unit].tex_size_x_mask); it = spu_and(it, spu.texture[unit].tex_size_y_mask); get_four_texels(unit, is, it, texels); /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ spu_unpack_A8R8G8B8_transpose4(texels, colors); } /** * Do bilinear texture sampling for four pixels. * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). */ void sample_texture4_bilinear(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { vector float ss = spu_madd(s, spu.texture[unit].width4, spu_splats(-0.5f)); vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f)); vector unsigned int is0 = spu_convtu(ss, 0); vector unsigned int it0 = spu_convtu(tt, 0); /* is + 1, it + 1 */ vector unsigned int is1 = spu_add(is0, 1); vector unsigned int it1 = spu_add(it0, 1); /* PIPE_TEX_WRAP_REPEAT */ is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask); it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask); is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask); it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask); /* get packed int texels */ vector unsigned int texels[16]; get_four_texels(unit, is0, it0, texels + 0); /* upper-left */ get_four_texels(unit, is1, it0, texels + 4); /* upper-right */ get_four_texels(unit, is0, it1, texels + 8); /* lower-left */ get_four_texels(unit, is1, it1, texels + 12); /* lower-right */ /* XXX possibly rework following code to compute the weighted sample * colors with integer arithmetic for fewer int->float conversions. 
*/ /* convert packed int texels to float colors */ vector float ftexels[16]; spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0); spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4); spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8); spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12); /* Compute weighting factors in [0,1] * Multiply texcoord by 1024, AND with 1023, convert back to float. */ vector float ss1024 = spu_mul(ss, spu_splats(1024.0f)); vector signed int iss1024 = spu_convts(ss1024, 0); iss1024 = spu_and(iss1024, 1023); vector float sWeights0 = spu_convtf(iss1024, 10); vector float tt1024 = spu_mul(tt, spu_splats(1024.0f)); vector signed int itt1024 = spu_convts(tt1024, 0); itt1024 = spu_and(itt1024, 1023); vector float tWeights0 = spu_convtf(itt1024, 10); /* 1 - sWeight and 1 - tWeight */ vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0); vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0); /* reds, for four pixels */ ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/ ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/ ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/ ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/ colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]), spu_add(ftexels[8], ftexels[12])); /* greens, for four pixels */ ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/ ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/ ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/ ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/ colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]), spu_add(ftexels[9], ftexels[13])); /* blues, for four pixels */ ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/ ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/ ftexels[10] = spu_mul(ftexels[10], 
spu_mul(sWeights1, tWeights0)); /*ll*/ ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/ colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]), spu_add(ftexels[10], ftexels[14])); /* alphas, for four pixels */ ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/ ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/ ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/ ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/ colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]), spu_add(ftexels[11], ftexels[15])); } /** * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h */ static INLINE void transpose(vector unsigned int *mOut, vector unsigned int *mIn) { vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */ vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */ vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */ vector unsigned char shufflehi = ((vector unsigned char) { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17}); vector unsigned char shufflelo = ((vector unsigned char) { 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F}); abcd = *(mIn+0); efgh = *(mIn+1); ijkl = *(mIn+2); mnop = *(mIn+3); aibj = spu_shuffle(abcd, ijkl, shufflehi); ckdl = spu_shuffle(abcd, ijkl, shufflelo); emfn = spu_shuffle(efgh, mnop, shufflehi); gohp = spu_shuffle(efgh, mnop, shufflelo); aeim = spu_shuffle(aibj, emfn, shufflehi); bfjn = spu_shuffle(aibj, emfn, shufflelo); cgko = spu_shuffle(ckdl, gohp, shufflehi); dhlp = spu_shuffle(ckdl, gohp, shufflelo); *(mOut+0) = aeim; *(mOut+1) = bfjn; *(mOut+2) = cgko; *(mOut+3) = dhlp; } /** * Bilinear filtering, using int intead of float arithmetic */ void sample_texture4_bilinear_2(vector float s, vector float t, vector float r, vector float q, uint unit, vector float colors[4]) { static const vector 
float half = {-0.5f, -0.5f, -0.5f, -0.5f}; /* Scale texcoords by size of texture, and add half pixel bias */ vector float ss = spu_madd(s, spu.texture[unit].width4, half); vector float tt = spu_madd(t, spu.texture[unit].height4, half); /* convert float coords to fixed-pt coords with 8 fraction bits */ vector unsigned int is = spu_convtu(ss, 8); vector unsigned int it = spu_convtu(tt, 8); /* compute integer texel weights in [0, 255] */ vector signed int sWeights0 = spu_and((vector signed int) is, 255); vector signed int tWeights0 = spu_and((vector signed int) it, 255); vector signed int sWeights1 = spu_sub(255, sWeights0); vector signed int tWeights1 = spu_sub(255, tWeights0); /* texel coords: is0 = is / 256, it0 = is / 256 */ vector unsigned int is0 = spu_rlmask(is, -8); vector unsigned int it0 = spu_rlmask(it, -8); /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ vector unsigned int is1 = spu_add(is0, 1); vector unsigned int it1 = spu_add(it0, 1); /* PIPE_TEX_WRAP_REPEAT */ is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask); it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask); is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask); it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask); /* get packed int texels */ vector unsigned int texels[16]; get_four_texels(unit, is0, it0, texels + 0); /* upper-left */ get_four_texels(unit, is1, it0, texels + 4); /* upper-right */ get_four_texels(unit, is0, it1, texels + 8); /* lower-left */ get_four_texels(unit, is1, it1, texels + 12); /* lower-right */ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ { static const unsigned char ZERO = 0x80; int i; for (i = 0; i < 16; i++) { texels[i] = spu_shuffle(texels[i], texels[i], ((vector unsigned char) { ZERO, ZERO, ZERO, 1, ZERO, ZERO, ZERO, 2, ZERO, ZERO, ZERO, 3, ZERO, ZERO, ZERO, 0})); } } /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */ transpose(texels + 0, texels + 0); transpose(texels + 4, texels + 4); transpose(texels + 8, texels + 8); 
transpose(texels + 12, texels + 12); /* computed weighted colors */ vector unsigned int c0, c1, c2, c3, cSum; /* red */ c0 = (vector unsigned int) si_mpyu((qword) texels[ 0], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ c1 = (vector unsigned int) si_mpyu((qword) texels[ 4], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ c2 = (vector unsigned int) si_mpyu((qword) texels[ 8], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ c3 = (vector unsigned int) si_mpyu((qword) texels[12], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); colors[0] = spu_convtf(cSum, 24); /* green */ c0 = (vector unsigned int) si_mpyu((qword) texels[ 1], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ c1 = (vector unsigned int) si_mpyu((qword) texels[ 5], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ c2 = (vector unsigned int) si_mpyu((qword) texels[ 9], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ c3 = (vector unsigned int) si_mpyu((qword) texels[13], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); colors[1] = spu_convtf(cSum, 24); /* blue */ c0 = (vector unsigned int) si_mpyu((qword) texels[ 2], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ c1 = (vector unsigned int) si_mpyu((qword) texels[ 6], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ c2 = (vector unsigned int) si_mpyu((qword) texels[10], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ c3 = (vector unsigned int) si_mpyu((qword) texels[14], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); colors[2] = spu_convtf(cSum, 24); /* alpha */ c0 = (vector unsigned int) si_mpyu((qword) texels[ 3], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/ c1 = (vector unsigned int) si_mpyu((qword) texels[ 7], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/ c2 = (vector unsigned int) si_mpyu((qword) 
texels[11], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/ c3 = (vector unsigned int) si_mpyu((qword) texels[15], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); colors[3] = spu_convtf(cSum, 24); }