aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas White <taw@physics.org>2020-09-01 15:57:29 +0200
committerThomas White <taw@physics.org>2020-09-01 15:57:29 +0200
commit6b3389b5b96efd28b70ab4258bc77f59da6b01ea (patch)
tree1fb39e0e5191a9ccd6c033803a9fd0e4a719a9fb
parent4bb477999843f1c14d81672b83607c7394e740b5 (diff)
SLURM: Track job status
-rw-r--r--src/gui_backend_slurm.c72
1 files changed, 70 insertions, 2 deletions
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c
index 586f1d0f..9f9ce920 100644
--- a/src/gui_backend_slurm.c
+++ b/src/gui_backend_slurm.c
@@ -52,18 +52,80 @@ struct slurm_indexing_opts
struct slurm_job
{
double frac_complete;
+ int n_frames;
int n_blocks;
uint32_t *job_ids;
+ char **stderr_filenames;
};
+static int read_number_processed(const char *filename)
+{
+ FILE *fh = fopen(filename, "r");
+ int n_proc;
+
+ do {
+ char line[1024];
+ if ( fgets(line, 1024, fh) == NULL ) break;
+
+ if ( strncmp(line, "Final: ", 7) == 0 ) {
+ sscanf(line, "Final: %i images processed", &n_proc);
+ } else if ( strstr(line, " images processed, ") != NULL ) {
+ sscanf(line, "%i ", &n_proc);
+ } else {
+ STATUS("%s\n", line);
+ }
+
+ } while ( 1 );
+
+ return n_proc;
+}
+
+
+static int job_running(uint32_t job_id)
+{
+ job_info_msg_t *job_info;
+ int running = 1;
+
+ if ( slurm_load_job(&job_info, job_id, 0) ) {
+ STATUS("Couldn't get status: %i\n",
+ slurm_strerror(slurm_get_errno()));
+ running = 0;
+ /* FIXME: Distinguish error cond from job complete */
+ }
+
+ /* FIXME: Check that job is actually still running */
+
+ slurm_free_job_info_msg(job_info);
+
+ return running;
+}
+
+
static int get_task_status(void *job_priv,
int *running,
float *frac_complete)
{
struct slurm_job *job = job_priv;
- *frac_complete = job->frac_complete;
- *running = 1;
+ int i;
+ int n_proc = 0;
+ int all_complete = 1;
+
+ for ( i=0; i<job->n_blocks; i++ ) {
+
+ n_proc += read_number_processed(job->stderr_filenames[i]);
+
+ if ( (job->job_ids[i] != 0)
+ && !job_running(job->job_ids[i]) )
+ {
+ job->job_ids[i] = 0;
+ } else {
+ all_complete = 0;
+ }
+ }
+
+ *frac_complete = (double)n_proc / job->n_frames;
+ *running = 1 - all_complete;
return 0;
}
@@ -295,6 +357,7 @@ static void *run_indexing(const char *job_title,
job = malloc(sizeof(struct slurm_job));
if ( job == NULL ) return 0;
+ job->n_frames = n_frames;
job->n_blocks = n_frames / opts->block_size;
if ( n_frames % opts->block_size ) job->n_blocks++;
STATUS("Splitting job into %i blocks of max %i frames\n",
@@ -303,6 +366,9 @@ static void *run_indexing(const char *job_title,
job->job_ids = malloc(job->n_blocks * sizeof(uint32_t));
if ( job->job_ids == NULL ) return NULL;
+ job->stderr_filenames = malloc(job->n_blocks * sizeof(char *));
+ if ( job->stderr_filenames == NULL ) return NULL;
+
for ( i=0; i<job->n_blocks; i++ ) {
char job_name[128];
@@ -343,6 +409,7 @@ static void *run_indexing(const char *job_title,
}
job->job_ids[i] = job_id;
+ job->stderr_filenames[i] = strdup(stderr_file);
STATUS("Submitted SLURM job ID %i\n", job_id);
}
@@ -353,6 +420,7 @@ static void *run_indexing(const char *job_title,
if ( fail ) {
free(job->job_ids);
+ free(job->stderr_filenames);
free(job);
return NULL;
}