diff options
author | Keith Whitwell <keithw@vmware.com> | 2010-02-17 11:07:45 +0000 |
---|---|---|
committer | Keith Whitwell <keithw@vmware.com> | 2010-02-17 11:12:04 +0000 |
commit | 31c816731ead9abce69829e6b5f55a201da3bb47 (patch) | |
tree | 75aa154c545875ac012b099533aa3e1d5e998f83 /src/gallium | |
parent | cb66e9f2b427afeffc0b92801e6943f0b17bf1f1 (diff) |
llvmpipe: rework do_block_16 to use bitmasks and ffs
Some nice speedups:
gears: 547 -> 597
isosurf: 83 -> 98
Others like gloss unchanged. Could do further work in this direction.
Diffstat (limited to 'src/gallium')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri.c | 68 |
1 file changed, 41 insertions, 27 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 3f76f159df..82ff2d9c89 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -146,42 +146,56 @@ static void do_block_16( struct lp_rasterizer_task *rast_task, const struct lp_rast_triangle *tri, int x, int y, + int c0, int c1, - int c2, - int c3 ) + int c2 ) { - const int eo1 = tri->eo1 * 4; - const int eo2 = tri->eo2 * 4; - const int eo3 = tri->eo3 * 4; - const int *step0 = tri->inputs.step[0]; - const int *step1 = tri->inputs.step[1]; - const int *step2 = tri->inputs.step[2]; - int i; + unsigned mask = 0; + int eo[3]; + int c[3]; + int i, j; assert(x % 16 == 0); assert(y % 16 == 0); - for (i = 0; i < 16; i++) { - int cx1 = c1 + step0[i] * 4; - int cx2 = c2 + step1[i] * 4; - int cx3 = c3 + step2[i] * 4; + eo[0] = tri->eo1 * 4; + eo[1] = tri->eo2 * 4; + eo[2] = tri->eo3 * 4; - if (cx1 + eo1 < 0 || - cx2 + eo2 < 0 || - cx3 + eo3 < 0) { - /* the block is completely outside the triangle - nop */ - LP_COUNT(nr_empty_4); - } - else { - int px = x + pos_table4[i][0]; - int py = y + pos_table4[i][1]; - /* Don't bother testing if the 4x4 block is entirely in/out of - * the triangle. It's a little faster to do it in the jit code. - */ - LP_COUNT(nr_non_empty_4); - do_block_4(rast_task, tri, px, py, cx1, cx2, cx3); + c[0] = c0; + c[1] = c1; + c[2] = c2; + + for (j = 0; j < 3; j++) { + const int *step = tri->inputs.step[j]; + int cx = c[j]; + int eox = eo[j]; + + /* Mask has bits set whenever we are outside any of the edges. 
+ */ + for (i = 0; i < 16; i++) { + int out = cx + step[i] * 4 + eox; + mask |= (out >> 31) & (1 << i); } } + + mask = ~mask & 0xffff; + while (mask) { + int i = ffs(mask) - 1; + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + int cx1 = c0 + tri->inputs.step[0][i] * 4; + int cx2 = c1 + tri->inputs.step[1][i] * 4; + int cx3 = c2 + tri->inputs.step[2][i] * 4; + + mask &= ~(1 << i); + + /* Don't bother testing if the 4x4 block is entirely in/out of + * the triangle. It's a little faster to do it in the jit code. + */ + LP_COUNT(nr_non_empty_4); + do_block_4(rast_task, tri, px, py, cx1, cx2, cx3); + } } |