aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--data/diffraction.cl36
-rw-r--r--src/diffraction-gpu.c48
-rw-r--r--src/diffraction.c2
3 files changed, 63 insertions, 23 deletions
diff --git a/data/diffraction.cl b/data/diffraction.cl
index 82a4adcf..babfc6b7 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -46,7 +46,7 @@ float4 quat_rot(float4 q, float4 z)
float4 get_q(int x, int y, float cx, float cy, float res, float clen, float k,
- float *ttp, float4 z)
+ float *ttp, float4 z, int sampling)
{
float rx, ry, r;
float ttx, tty, tt;
@@ -131,19 +131,43 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
int w, float cx, float cy,
float res, float clen, float16 cell,
global float2 *sfacs, float4 z, int4 ncells,
- int xmin, int ymin)
+ int xmin, int ymin, int sampling, local float2 *tmp)
{
float ttv;
const int x = get_global_id(0) + xmin;
const int y = get_global_id(1) + ymin;
float f_lattice;
float2 f_molecule;
+ float4 q;
+ const int lx = get_local_id(0);
+ const int ly = get_local_id(1);
+ const int ax = x / sampling;
+ const int ay = y / sampling;
- float4 q = get_q(x, y, cx, cy, res, clen, k, &ttv, z);
-
+ /* Calculate value */
+ q = get_q(x, y, cx, cy, res, clen, k, &ttv, z, sampling);
f_lattice = lattice_factor(cell, q, ncells);
f_molecule = get_sfac(sfacs, cell, q);
- diff[x+w*y] = f_molecule * f_lattice;
- tt[x+w*y] = ttv;
+ /* Write the value to local memory */
+ tmp[lx+sampling*ly] = f_molecule * f_lattice;
+
+ /* Memory fence */
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* Leader thread sums the values */
+ if ( lx + ly == 0 ) {
+
+ int i;
+ float2 sum = (0.0, 0.0);
+
+ for ( i=0; i<sampling*sampling; i++ ) sum += tmp[i];
+
+ diff[ax+w*ay] = sum / (sampling*sampling);
+
+ /* Leader thread also records 2theta value.
+ * This should really be averaged across all pixels, but
+ * I strongly suspect this would be a waste of time. */
+ tt[ax+w*ay] = ttv;
+ }
}
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index 344da6c6..9057d3cc 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -24,9 +24,9 @@
#include "sfac.h"
-#define SAMPLING (5)
-#define BWSAMPLING (10)
-#define BANDWIDTH (1.0 / 100.0)
+#define SAMPLING (4)
+#define BWSAMPLING (1)
+#define BANDWIDTH (0.0 / 100.0)
struct gpu_context
@@ -150,7 +150,6 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
double bx, by, bz;
double cx, cy, cz;
float kc;
- size_t dims[2];
cl_event *event;
int p;
float *tt_ptr;
@@ -159,6 +158,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
float *diff_ptr;
cl_float4 orientation;
cl_int4 ncells;
+ const int sampling = SAMPLING;
cell_get_cartesian(image->molecule->cell, &ax, &ay, &az,
&bx, &by, &bz,
@@ -221,30 +221,49 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
ERROR("Couldn't set arg 11: %s\n", clError(err));
return;
}
+ clSetKernelArg(gctx->kern, 14, sizeof(cl_int), &sampling);
+ if ( err != CL_SUCCESS ) {
+ ERROR("Couldn't set arg 14: %s\n", clError(err));
+ return;
+ }
+ /* Local memory for reduction */
+ clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float),
+ NULL);
+ if ( err != CL_SUCCESS ) {
+ ERROR("Couldn't set arg 15: %s\n", clError(err));
+ return;
+ }
/* Iterate over panels */
event = malloc(image->det.n_panels * sizeof(cl_event));
for ( p=0; p<image->det.n_panels; p++ ) {
+ size_t dims[2];
+ size_t ldims[2] = {SAMPLING, SAMPLING};
+ cl_float res = image->det.panels[p].res * SAMPLING;
+ int offsx = image->det.panels[p].min_x * SAMPLING;
+ int offsy = image->det.panels[p].min_y * SAMPLING;
+ cl_float centx = image->det.panels[p].cx * SAMPLING;
+ cl_float centy = image->det.panels[p].cy * SAMPLING;
+
/* In a future version of OpenCL, this could be done
* with a global work offset. But not yet... */
dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x;
dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y;
+ dims[0] *= SAMPLING;
+ dims[1] *= SAMPLING;
- clSetKernelArg(gctx->kern, 4, sizeof(cl_float),
- &image->det.panels[p].cx);
+ clSetKernelArg(gctx->kern, 4, sizeof(cl_float), &centx);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 4: %s\n", clError(err));
return;
}
- clSetKernelArg(gctx->kern, 5, sizeof(cl_float),
- &image->det.panels[p].cy);
+ clSetKernelArg(gctx->kern, 5, sizeof(cl_float), &centy);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 5: %s\n", clError(err));
return;
}
- clSetKernelArg(gctx->kern, 6, sizeof(cl_float),
- &image->det.panels[p].res);
+ clSetKernelArg(gctx->kern, 6, sizeof(cl_float), &res);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 6: %s\n", clError(err));
return;
@@ -255,22 +274,19 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
ERROR("Couldn't set arg 7: %s\n", clError(err));
return;
}
-
- clSetKernelArg(gctx->kern, 12, sizeof(cl_int),
- &image->det.panels[p].min_x);
+ clSetKernelArg(gctx->kern, 12, sizeof(cl_int), &offsx);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 12: %s\n", clError(err));
return;
}
- clSetKernelArg(gctx->kern, 13, sizeof(cl_int),
- &image->det.panels[p].min_y);
+ clSetKernelArg(gctx->kern, 13, sizeof(cl_int), &offsy);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't set arg 13: %s\n", clError(err));
return;
}
err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL,
- dims, NULL, 0, NULL, &event[p]);
+ dims, ldims, 0, NULL, &event[p]);
if ( err != CL_SUCCESS ) {
ERROR("Couldn't enqueue diffraction kernel: %s\n",
clError(err));
diff --git a/src/diffraction.c b/src/diffraction.c
index a66c2310..876fac5d 100644
--- a/src/diffraction.c
+++ b/src/diffraction.c
@@ -23,7 +23,7 @@
#include "sfac.h"
-#define SAMPLING (1)
+#define SAMPLING (4)
#define BWSAMPLING (1)
#define BANDWIDTH (0.0 / 100.0)