From c7e0450702ab668cdcda34541e6bf815d50be8a8 Mon Sep 17 00:00:00 2001 From: Thomas White Date: Fri, 19 Feb 2010 19:12:55 +0100 Subject: Add bandwidth to GPU calculation Also: alter CPU version to be cleaner and give exactly the same results as the GPU version, and fix an indexing bug --- data/diffraction.cl | 14 +++++++++----- src/diffraction-gpu.c | 34 ++++++++++++++++++++++------------ src/diffraction.c | 13 +++++++------ 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/data/diffraction.cl b/data/diffraction.cl index 138af028..41d331b3 100644 --- a/data/diffraction.cl +++ b/data/diffraction.cl @@ -127,11 +127,12 @@ float2 get_sfac(global float2 *sfacs, float16 cell, float4 q) } -kernel void diffraction(global float2 *diff, global float *tt, float k, +kernel void diffraction(global float2 *diff, global float *tt, float klow, int w, float cx, float cy, float res, float clen, float16 cell, global float2 *sfacs, float4 z, int4 ncells, - int xmin, int ymin, int sampling, local float2 *tmp) + int xmin, int ymin, int sampling, local float2 *tmp, + float kstep) { float ttv; const int x = get_global_id(0) + (xmin*sampling); @@ -141,6 +142,8 @@ kernel void diffraction(global float2 *diff, global float *tt, float k, float4 q; const int lx = get_local_id(0); const int ly = get_local_id(1); + const int lb = get_local_id(2); + float k = klow + kstep * get_local_id(2); const int ax = x / sampling; const int ay = y / sampling; @@ -150,7 +153,7 @@ kernel void diffraction(global float2 *diff, global float *tt, float k, f_molecule = get_sfac(sfacs, cell, q); /* Write the value to local memory */ - tmp[lx+sampling*ly] = f_molecule * f_lattice; + tmp[lx+sampling*ly+sampling*sampling*lb] = f_molecule * f_lattice; /* Memory fence */ barrier(CLK_LOCAL_MEM_FENCE); @@ -161,9 +164,10 @@ kernel void diffraction(global float2 *diff, global float *tt, float k, int i; float2 sum = (0.0, 0.0); - for ( i=0; imolecule->cell, &ax, &ay, &az, &bx, &by, &bz, @@ -169,7 +170,9 @@ void 
get_diffraction_gpu(struct gpu_context *gctx, struct image *image, cell[6] = cx; cell[7] = cy; cell[8] = cz; /* Calculate wavelength */ - kc = 1.0/image->lambda; /* Centre value */ + k = 1.0/image->lambda; /* Centre value */ + klow = k - k*(BANDWIDTH/2.0); /* Lower value */ + bwstep = k * BANDWIDTH / BWSAMPLING; /* Orientation */ orientation[0] = image->orientation.w; @@ -192,7 +195,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, ERROR("Couldn't set arg 1: %s\n", clError(err)); return; } - clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &kc); + clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &klow); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 2: %s\n", clError(err)); return; @@ -228,26 +231,33 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, return; } /* Local memory for reduction */ - clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float), - NULL); + clSetKernelArg(gctx->kern, 15, + BWSAMPLING*SAMPLING*SAMPLING*2*sizeof(cl_float), NULL); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 15: %s\n", clError(err)); return; } + /* Bandwidth sampling step */ + clSetKernelArg(gctx->kern, 16, sizeof(cl_float), &bwstep); + if ( err != CL_SUCCESS ) { + ERROR("Couldn't set arg 16: %s\n", clError(err)); + return; + } /* Iterate over panels */ event = malloc(image->det.n_panels * sizeof(cl_event)); for ( p=0; pdet.n_panels; p++ ) { - size_t dims[2]; - size_t ldims[2] = {SAMPLING, SAMPLING}; + size_t dims[3]; + size_t ldims[3] = {SAMPLING, SAMPLING, BWSAMPLING}; /* In a future version of OpenCL, this could be done * with a global work offset. But not yet... 
*/ - dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x; - dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y; + dims[0] = 1+image->det.panels[0].max_x-image->det.panels[0].min_x; + dims[1] = 1+image->det.panels[0].max_y-image->det.panels[0].min_y; dims[0] *= SAMPLING; dims[1] *= SAMPLING; + dims[2] = BWSAMPLING; clSetKernelArg(gctx->kern, 4, sizeof(cl_float), &image->det.panels[p].cx); @@ -286,7 +296,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, return; } - err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL, + err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 3, NULL, dims, ldims, 0, NULL, &event[p]); if ( err != CL_SUCCESS ) { ERROR("Couldn't enqueue diffraction kernel: %s\n", diff --git a/src/diffraction.c b/src/diffraction.c index 876fac5d..6df00164 100644 --- a/src/diffraction.c +++ b/src/diffraction.c @@ -24,8 +24,8 @@ #define SAMPLING (4) -#define BWSAMPLING (1) -#define BANDWIDTH (0.0 / 100.0) +#define BWSAMPLING (10) +#define BANDWIDTH (1.0 / 100.0) static double lattice_factor(struct rvec q, double ax, double ay, double az, @@ -184,7 +184,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac) double ax, ay, az; double bx, by, bz; double cx, cy, cz; - float kc; + float k, klow, bwstep; if ( image->molecule == NULL ) return; @@ -206,7 +206,9 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac) /* Needed later for Lorentz calculation */ image->twotheta = malloc(image->width * image->height * sizeof(double)); - kc = 1.0/image->lambda; /* Centre value */ + k = 1.0/image->lambda; /* Centre value */ + klow = k - k*(BANDWIDTH/2.0); /* Lower value */ + bwstep = k * BANDWIDTH / BWSAMPLING; for ( xs=0; xswidth*SAMPLING; xs++ ) { for ( ys=0; ysheight*SAMPLING; ys++ ) { @@ -229,8 +231,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac) double complex val; /* Calculate k this time round */ - k = kc + 
(kstep-(BWSAMPLING/2)) * - kc*(BANDWIDTH/BWSAMPLING); + k = klow + kstep * bwstep; q = get_q(image, xs, ys, SAMPLING, &twotheta, k); image->twotheta[x + image->width*y] = twotheta; -- cgit v1.2.3