diff options
-rw-r--r-- | data/diffraction.cl | 55 | ||||
-rw-r--r-- | src/diffraction-gpu.c | 27 | ||||
-rw-r--r-- | src/diffraction.c | 3 |
3 files changed, 64 insertions, 21 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl index a3986107..ec3d528f 100644 --- a/data/diffraction.cl +++ b/data/diffraction.cl @@ -29,12 +29,14 @@ const sampler_t sampler_c = CLK_NORMALIZED_COORDS_TRUE float4 get_q(float fs, float ss, float res, float clen, float k, float *ttp, float corner_x, float corner_y, - float fsx, float fsy, float ssx, float ssy) + float fsx, float fsy, float ssx, float ssy, + float xdiv, float ydiv) { float rx, ry, r; float az, tt; float4 q; float xs, ys; + float kx, ky, kz; xs = fs*fsx + ss*ssx; ys = fs*fsy + ss*ssy; @@ -49,9 +51,19 @@ float4 get_q(float fs, float ss, float res, float clen, float k, az = atan2(ry, rx); - q = (float4)(k*native_sin(tt)*native_cos(az), - k*native_sin(tt)*native_sin(az), - k*(native_cos(tt)-1.0), 0.0); + kx = k*native_sin(tt)*native_cos(az); + ky = k*native_sin(tt)*native_sin(az); + kz = k*(native_cos(tt)-1.0); + + /* x divergence */ + kx = kx*cos(xdiv) +kz*sin(xdiv); + kz = -kx*sin(xdiv) +kz*cos(xdiv); + + /* y divergence */ + ky = ky*cos(ydiv) +kz*sin(ydiv); + kz = -ky*sin(ydiv) +kz*cos(ydiv); + + q = (float4)(kx, ky, kz, 0.0); return q; } @@ -202,18 +214,32 @@ kernel void diffraction(global float *diff, global float *tt, float klow, read_only image2d_t func_b, read_only image2d_t func_c, global float *flags, - float fsx, float fsy, float ssx, float ssy) + float fsx, float fsy, float ssx, float ssy, + float divxlow, float divxstep, int divxsamp, + float divylow, float divystep, int divysamp) { float ttv; float fs, ss; float f_lattice, I_lattice; float I_molecule; float4 q; - const int lx = get_local_id(0); - const int ly = get_local_id(1); - const int lb = get_local_id(2); float k = klow + kstep * get_local_id(2); float intensity; + const int ls0 = get_local_size(0); + const int ls1 = get_local_size(1); + const int ls2 = get_local_size(2) / (divxsamp*divysamp); + const int ls3 = divxsamp; + const int ls4 = divysamp; + const int li0 = get_local_id(0); + const int li1 = get_local_id(1); + const int li234 = get_local_id(2); + const int li2 = li234 / (ls3*ls4); + const int li234leftover = li234 % (ls3*ls4); + const int li3 = li234leftover / ls4; + const int li4 = li234leftover % ls4; + const int ls = ls0 * ls1 * ls2 * ls3 * ls4; + float xdiv = divxlow + divxstep*ls4; + float ydiv = divylow + divystep*ls3; /* Calculate fractional coordinates in fs/ss */ fs = convert_float(get_global_id(0)) / convert_float(sampling); @@ -221,7 +247,7 @@ kernel void diffraction(global float *diff, global float *tt, float klow, /* Get the scattering vector */ q = get_q(fs, ss, res, clen, k, &ttv, - corner_x, corner_y, fsx, fsy, ssx, ssy); + corner_x, corner_y, fsx, fsy, ssx, ssy, xdiv, ydiv); /* Calculate the diffraction */ f_lattice = lattice_factor(cell, q, func_a, func_b, func_c); @@ -230,13 +256,14 @@ kernel void diffraction(global float *diff, global float *tt, float klow, /* Write the value to local memory */ intensity = I_molecule * I_lattice; - tmp[lx+sampling*ly+sampling*sampling*lb] = intensity; + tmp[li0 + ls0*li1 + ls0*ls1*li2 + ls0*ls1*ls2*li3 + + ls0*ls1*ls2*ls3*li4] = intensity; /* Memory fence */ barrier(CLK_LOCAL_MEM_FENCE); /* Leader thread sums the values */ - if ( lx + ly + lb == 0 ) { + if ( li0 + li1 + li2 + li3 + li4 == 0 ) { int i; float sum = 0.0; @@ -245,11 +272,9 @@ kernel void diffraction(global float *diff, global float *tt, float klow, idx = convert_int_rtz(fs) + w*convert_int_rtz(ss); - for ( i=0; i<sampling*sampling*get_local_size(2); i++ ) - sum += tmp[i]; + for ( i=0; i<ls; i++ ) sum += tmp[i]; - val = sum / convert_float(get_local_size(0)*get_local_size(1) - *get_local_size(2)); + val = sum / convert_float(ls); diff[idx] = val; /* Leader thread also records the 2theta value */ diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c index c0814e79..74e2292a 100644 --- a/src/diffraction-gpu.c +++ b/src/diffraction-gpu.c @@ -34,7 +34,8 @@ #define SAMPLING (4) -#define BWSAMPLING (10) +#define BWSAMPLING (1) +#define DIVSAMPLING (4) #define SINC_LUT_ELEMENTS (4096) @@ -166,7 +167,10 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, cl_float bwstep; int n_inf = 0; int n_neg = 0; + cl_float divxlow, divxstep; + cl_float divylow, divystep; int n_nan = 0; + int sprod; if ( gctx == NULL ) { ERROR("GPU setup failed.\n"); @@ -183,6 +187,12 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, khigh = 1.0/(image->lambda*(1.0 - image->beam->bandwidth/2.0)); bwstep = (khigh-klow) / BWSAMPLING; + /* Calculate divergence stuff */ + divxlow = -image->beam->divergence/2.0; + divylow = -image->beam->divergence/2.0; + divxstep = image->beam->divergence / DIVSAMPLING; + divystep = image->beam->divergence / DIVSAMPLING; + ncells.s[0] = na; ncells.s[1] = nb; ncells.s[2] = nc; @@ -201,6 +211,12 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, if ( set_arg_mem(gctx, 16, gctx->sinc_luts[nb-1]) ) return; if ( set_arg_mem(gctx, 17, gctx->sinc_luts[nc-1]) ) return; if ( set_arg_mem(gctx, 18, gctx->flags) ) return; + if ( set_arg_float(gctx, 23, divxlow) ) return; + if ( set_arg_float(gctx, 24, divxstep) ) return; + if ( set_arg_int(gctx, 25, DIVSAMPLING) ) return; + if ( set_arg_float(gctx, 26, divylow) ) return; + if ( set_arg_float(gctx, 27, divystep) ) return; + if ( set_arg_int(gctx, 28, DIVSAMPLING) ) return; /* Unit cell */ err = clSetKernelArg(gctx->kern, 8, sizeof(cl_float16), &cell); @@ -210,8 +226,8 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, } /* Local memory for reduction */ - err = clSetKernelArg(gctx->kern, 13, - BWSAMPLING*SAMPLING*SAMPLING*sizeof(cl_float), NULL); + sprod = BWSAMPLING*SAMPLING*SAMPLING*DIVSAMPLING*DIVSAMPLING; + err = clSetKernelArg(gctx->kern, 13, sprod*sizeof(cl_float), NULL); if ( err != CL_SUCCESS ) { ERROR("Couldn't set local memory: %s\n", clError(err)); return; @@ -225,7 +241,8 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, for ( i=0; i<image->det->n_panels; i++ ) { size_t dims[3]; - size_t ldims[3] = {SAMPLING, SAMPLING, BWSAMPLING}; + size_t ldims[3] = {SAMPLING, SAMPLING, + BWSAMPLING * DIVSAMPLING * DIVSAMPLING}; struct panel *p; cl_mem tt; size_t tt_size; @@ -273,7 +290,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, dims[0] = pan_width * SAMPLING; dims[1] = pan_height * SAMPLING; - dims[2] = BWSAMPLING; + dims[2] = BWSAMPLING * DIVSAMPLING * DIVSAMPLING; err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 3, NULL, dims, ldims, 0, NULL, NULL); diff --git a/src/diffraction.c b/src/diffraction.c index cd372a28..b55d2265 100644 --- a/src/diffraction.c +++ b/src/diffraction.c @@ -27,7 +27,8 @@ #define SAMPLING (4) -#define BWSAMPLING (10) +#define BWSAMPLING (1) +#define DIVSAMPLING (4) #define SINC_LUT_ELEMENTS (4096) |