From c7e0450702ab668cdcda34541e6bf815d50be8a8 Mon Sep 17 00:00:00 2001
From: Thomas White <taw@physics.org>
Date: Fri, 19 Feb 2010 19:12:55 +0100
Subject: Add bandwidth to GPU calculation

Also: alter CPU version to be cleaner and give exactly the same results at GPU,
and fix an indexing bug
---
 data/diffraction.cl   | 14 +++++++++-----
 src/diffraction-gpu.c | 34 ++++++++++++++++++++++------------
 src/diffraction.c     | 13 +++++++------
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/data/diffraction.cl b/data/diffraction.cl
index 138af028..41d331b3 100644
--- a/data/diffraction.cl
+++ b/data/diffraction.cl
@@ -127,11 +127,12 @@ float2 get_sfac(global float2 *sfacs, float16 cell, float4 q)
 }
 
 
-kernel void diffraction(global float2 *diff, global float *tt, float k,
+kernel void diffraction(global float2 *diff, global float *tt, float klow,
                        int w, float cx, float cy,
                        float res, float clen, float16 cell,
                        global float2 *sfacs, float4 z, int4 ncells,
-                       int xmin, int ymin, int sampling, local float2 *tmp)
+                       int xmin, int ymin, int sampling, local float2 *tmp,
+                       float kstep)
 {
 	float ttv;
 	const int x = get_global_id(0) + (xmin*sampling);
@@ -141,6 +142,8 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
 	float4 q;
 	const int lx = get_local_id(0);
 	const int ly = get_local_id(1);
+	const int lb = get_local_id(2);
+	float k = klow + kstep * get_local_id(2);
 	const int ax = x / sampling;
 	const int ay = y / sampling;
 
@@ -150,7 +153,7 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
 	f_molecule = get_sfac(sfacs, cell, q);
 
 	/* Write the value to local memory */
-	tmp[lx+sampling*ly] = f_molecule * f_lattice;
+	tmp[lx+sampling*ly+sampling*sampling*lb] = f_molecule * f_lattice;
 
 	/* Memory fence */
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -161,9 +164,10 @@ kernel void diffraction(global float2 *diff, global float *tt, float k,
 		int i;
 		float2 sum = (0.0, 0.0);
 
-		for ( i=0; i<sampling*sampling; i++ ) sum += tmp[i];
+		for ( i=0; i<sampling*sampling*get_local_size(2); i++ )
+			sum += tmp[i];
 
-		diff[ax+w*ay] = sum / (sampling*sampling);
+		diff[ax+w*ay] = sum / (sampling*sampling*get_local_size(2));
 
 		/* Leader thread also records the 2theta value.
 		 * This should really be averaged across all pixels, but
diff --git a/src/diffraction-gpu.c b/src/diffraction-gpu.c
index b9485241..133c0bc1 100644
--- a/src/diffraction-gpu.c
+++ b/src/diffraction-gpu.c
@@ -25,8 +25,8 @@
 
 
 #define SAMPLING (4)
-#define BWSAMPLING (1)
-#define BANDWIDTH (0.0 / 100.0)
+#define BWSAMPLING (10)
+#define BANDWIDTH (1.0 / 100.0)
 
 
 struct gpu_context
@@ -150,7 +150,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	double ax, ay, az;
 	double bx, by, bz;
 	double cx, cy, cz;
-	float kc;
+	float k, klow;
 	cl_event *event;
 	int p;
 	float *tt_ptr;
@@ -160,6 +160,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	cl_float4 orientation;
 	cl_int4 ncells;
 	const int sampling = SAMPLING;
+	cl_float bwstep;
 
 	cell_get_cartesian(image->molecule->cell, &ax, &ay, &az,
 		                                  &bx, &by, &bz,
@@ -169,7 +170,9 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 	cell[6] = cx;  cell[7] = cy;  cell[8] = cz;
 
 	/* Calculate wavelength */
-	kc = 1.0/image->lambda;  /* Centre value */
+	k = 1.0/image->lambda;  /* Centre value */
+	klow = k - k*(BANDWIDTH/2.0);  /* Lower value */
+	bwstep = k * BANDWIDTH / BWSAMPLING;
 
 	/* Orientation */
 	orientation[0] = image->orientation.w;
@@ -192,7 +195,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		ERROR("Couldn't set arg 1: %s\n", clError(err));
 		return;
 	}
-	clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &kc);
+	clSetKernelArg(gctx->kern, 2, sizeof(cl_float), &klow);
 	if ( err != CL_SUCCESS ) {
 		ERROR("Couldn't set arg 2: %s\n", clError(err));
 		return;
@@ -228,26 +231,33 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 		return;
 	}
 	/* Local memory for reduction */
-	clSetKernelArg(gctx->kern, 15, SAMPLING*SAMPLING*2*sizeof(cl_float),
-	               NULL);
+	clSetKernelArg(gctx->kern, 15,
+	               BWSAMPLING*SAMPLING*SAMPLING*2*sizeof(cl_float), NULL);
 	if ( err != CL_SUCCESS ) {
 		ERROR("Couldn't set arg 15: %s\n", clError(err));
 		return;
 	}
+	/* Bandwidth sampling step */
+	clSetKernelArg(gctx->kern, 16, sizeof(cl_float), &bwstep);
+	if ( err != CL_SUCCESS ) {
+		ERROR("Couldn't set arg 16: %s\n", clError(err));
+		return;
+	}
 
 	/* Iterate over panels */
 	event = malloc(image->det.n_panels * sizeof(cl_event));
 	for ( p=0; p<image->det.n_panels; p++ ) {
 
-		size_t dims[2];
-		size_t ldims[2] = {SAMPLING, SAMPLING};
+		size_t dims[3];
+		size_t ldims[3] = {SAMPLING, SAMPLING, BWSAMPLING};
 
 		/* In a future version of OpenCL, this could be done
 		 * with a global work offset.  But not yet... */
-		dims[0] = image->det.panels[0].max_x-image->det.panels[0].min_x;
-		dims[1] = image->det.panels[0].max_y-image->det.panels[0].min_y;
+		dims[0] = 1+image->det.panels[0].max_x-image->det.panels[0].min_x;
+		dims[1] = 1+image->det.panels[0].max_y-image->det.panels[0].min_y;
 		dims[0] *= SAMPLING;
 		dims[1] *= SAMPLING;
+		dims[2] = BWSAMPLING;
 
 		clSetKernelArg(gctx->kern, 4, sizeof(cl_float),
 		               &image->det.panels[p].cx);
@@ -286,7 +296,7 @@ void get_diffraction_gpu(struct gpu_context *gctx, struct image *image,
 			return;
 		}
 
-		err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 2, NULL,
+		err = clEnqueueNDRangeKernel(gctx->cq, gctx->kern, 3, NULL,
 		                             dims, ldims, 0, NULL, &event[p]);
 		if ( err != CL_SUCCESS ) {
 			ERROR("Couldn't enqueue diffraction kernel: %s\n",
diff --git a/src/diffraction.c b/src/diffraction.c
index 876fac5d..6df00164 100644
--- a/src/diffraction.c
+++ b/src/diffraction.c
@@ -24,8 +24,8 @@
 
 
 #define SAMPLING (4)
-#define BWSAMPLING (1)
-#define BANDWIDTH (0.0 / 100.0)
+#define BWSAMPLING (10)
+#define BANDWIDTH (1.0 / 100.0)
 
 
 static double lattice_factor(struct rvec q, double ax, double ay, double az,
@@ -184,7 +184,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
 	double ax, ay, az;
 	double bx, by, bz;
 	double cx, cy, cz;
-	float kc;
+	float k, klow, bwstep;
 
 	if ( image->molecule == NULL ) return;
 
@@ -206,7 +206,9 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
 	/* Needed later for Lorentz calculation */
 	image->twotheta = malloc(image->width * image->height * sizeof(double));
 
-	kc = 1.0/image->lambda;  /* Centre value */
+	k = 1.0/image->lambda;  /* Centre value */
+	klow = k - k*(BANDWIDTH/2.0);  /* Lower value */
+	bwstep = k * BANDWIDTH / BWSAMPLING;
 
 	for ( xs=0; xs<image->width*SAMPLING; xs++ ) {
 	for ( ys=0; ys<image->height*SAMPLING; ys++ ) {
@@ -229,8 +231,7 @@ void get_diffraction(struct image *image, int na, int nb, int nc, int no_sfac)
 			double complex val;
 
 			/* Calculate k this time round */
-			k = kc + (kstep-(BWSAMPLING/2)) *
-			                              kc*(BANDWIDTH/BWSAMPLING);
+			k = klow + kstep * bwstep;
 
 			q = get_q(image, xs, ys, SAMPLING, &twotheta, k);
 			image->twotheta[x + image->width*y] = twotheta;
-- 
cgit v1.2.3