diff options
-rw-r--r-- | data/diffraction.cl | 36 | ||||
-rw-r--r-- | src/diffraction-gpu.c | 48 | ||||
-rw-r--r-- | src/diffraction.c | 2 |
3 files changed, 63 insertions, 23 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl index 82a4adcf..babfc6b7 100644 --- a/data/diffraction.cl +++ b/data/diffraction.cl @@ -46,7 +46,7 @@ float4 quat_rot(float4 q, float4 z) float4 get_q(int x, int y, float cx, float cy, float res, float clen, float k, - float *ttp, float4 z) + float *ttp, float4 z, int sampling) { float rx, ry, r; float ttx, tty, tt; @@ -131,19 +131,43 @@ kernel void diffraction(global float2 *diff, global float *tt, float k, int w, float cx, float cy, float res, float clen, float16 cell, global float2 *sfacs, float4 z, int4 ncells, - int xmin, int ymin) + int xmin, int ymin, int sampling, local float2 *tmp) { float ttv; const int x = get_global_id(0) + xmin; const int y = get_global_id(1) + ymin; float f_lattice; float2 f_molecule; + float4 q; + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int ax = x / sampling; + const int ay = y / sampling; - float4 q = get_q(x, y, cx, cy, res, clen, k, &ttv, z); - + /* Calculate value */ + q = get_q(x, y, cx, cy, res, clen, k, &ttv, z, sampling); f_lattice = lattice_factor(cell, q, ncells); f_molecule = get_sfac(sfacs, cell, q); - diff[x+w*y] = f_molecule * f_lattice; - tt[x+w*y] = ttv; + /* Write the value to local memory */ + tmp[lx+sampling*ly] = f_molecule * f_lattice; + + /* Memory fence */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* Leader thread sums the values */ + if ( lx + ly == 0 ) { + + int i; + float2 sum = (0.0, 0.0); + + for ( i=0; i<sampling*sampling; i++ ) sum += tmp[i]; + + diff[ax+w*ay] = sum / (sampling*sampling); + + /* Leader thread also records 2theta value. + * This should really be averaged across all pixels, but + * I strongly suspect this would be a waste of time. */ + tt[ax+w*ay] = ttv; + } } diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c index 344da6c6..9057d3cc 100644 --- a/src/diffraction-gpu.c +++ b/src/diffraction-gpu.c @@ -24,9 +24,9 @@ #include "sfac.h" -#define SAMPLING (5) -#define BWSAMPLING (10) -#define BANDWIDTH (1.0 / 100.0) +#define SAMPLING (4) +#define BWSAMPLING (1) +#define BANDWIDTH (0.0 / 100.0) struct gpu_context @@ -150,7 +150,6 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, double bx, by, bz; double cx, cy, cz; float kc; - size_t dims[2]; cl_event *event; int p; float *tt_ptr; @@ -159,6 +158,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, float *diff_ptr; cl_float4 orientation; cl_int4 ncells; + const int sampling = SAMPLING; cell_get_cartesian(image->molecule->cell, &ax, &ay, &az, &bx, &by, &bz, @@ -221,30 +221,49 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, ERROR("Couldn't set arg 11: %s\n", clError(err)); return; } + clSetKernelArg(gctx->kern, 14, sizeof(cl_int), &sampling); + if ( err != CL_SUCCESS ) { + ERROR("Couldn't set arg 14: %s\n", clError(err)); + return; + } + /* Local memory for reduction */ + clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float), + NULL); + if ( err != CL_SUCCESS ) { + ERROR("Couldn't set arg 15: %s\n", clError(err)); + return; + } /* Iterate over panels */ event = malloc(image->det.n_panels * sizeof(cl_event)); for ( p=0; p<image->det.n_panels; p++ ) { + size_t dims[2]; + size_t ldims[2] = {SAMPLING, SAMPLING}; + cl_float res = image->det.panels[p].res * SAMPLING; + int offsx = image->det.panels[p].min_x * SAMPLING; + int offsy = image->det.panels[p].min_y * SAMPLING; + cl_float centx = image->det.panels[p].cx * SAMPLING; + cl_float centy = image->det.panels[p].cy * SAMPLING; + /* In a future version of OpenCL, this could be done * with a global work offset. But not yet... */ dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x; dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y; + dims[0] *= SAMPLING; + dims[1] *= SAMPLING; - clSetKernelArg(gctx->kern, 4, sizeof(cl_float), - &image->det.panels[p].cx); + clSetKernelArg(gctx->kern, 4, sizeof(cl_float), ¢x); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 4: %s\n", clError(err)); return; } - clSetKernelArg(gctx->kern, 5, sizeof(cl_float), - &image->det.panels[p].cy); + clSetKernelArg(gctx->kern, 5, sizeof(cl_float), ¢y); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 5: %s\n", clError(err)); return; } - clSetKernelArg(gctx->kern, 6, sizeof(cl_float), - &image->det.panels[p].res); + clSetKernelArg(gctx->kern, 6, sizeof(cl_float), &res); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 6: %s\n", clError(err)); return; @@ -255,22 +274,19 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image, ERROR("Couldn't set arg 7: %s\n", clError(err)); return; } - - clSetKernelArg(gctx->kern, 12, sizeof(cl_int), - &image->det.panels[p].min_x); + clSetKernelArg(gctx->kern, 12, sizeof(cl_int), &offsx); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 12: %s\n", clError(err)); return; } - clSetKernelArg(gctx->kern, 13, sizeof(cl_int), - &image->det.panels[p].min_y); + clSetKernelArg(gctx->kern, 13, sizeof(cl_int), &offsy); if ( err != CL_SUCCESS ) { ERROR("Couldn't set arg 13: %s\n", clError(err)); return; } err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL, - dims, NULL, 0, NULL, &event[p]); + dims, ldims, 0, NULL, &event[p]); if ( err != CL_SUCCESS ) { ERROR("Couldn't enqueue diffraction kernel: %s\n", clError(err)); diff --git a/src/diffraction.c b/src/diffraction.c index a66c2310..876fac5d 100644 --- a/src/diffraction.c +++ b/src/diffraction.c @@ -23,7 +23,7 @@ #include "sfac.h" -#define SAMPLING (1) +#define SAMPLING (4) #define BWSAMPLING (1) #define BANDWIDTH (0.0 / 100.0) |