From b568f3d4cc06fe96e072b988ed776d986fec4a1a Mon Sep 17 00:00:00 2001 From: Thomas White Date: Tue, 24 Sep 2024 13:59:29 +0200 Subject: GUI: Improve progress bar for large Slurm jobs Previously, it counted the number of "COMPLETED" jobs. This is unreliable, because sub-jobs might be taken out of the scontrol output before the whole job is complete. Now, it counts the PENDING (and similar) states, and subtracts from the total number of tasks to get a reliable count. Fixes: https://gitlab.desy.de/thomas.white/crystfel/-/issues/102 --- src/gui_backend_slurm.c | 84 ++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 53 deletions(-) diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c index c377858e..1c81dc55 100644 --- a/src/gui_backend_slurm.c +++ b/src/gui_backend_slurm.c @@ -102,16 +102,6 @@ static const char *get_str_val(const char *line, const char *key) } -static int job_alive(const char *s) -{ - if ( strcmp(s, "PENDING") == 0 ) return 1; - if ( strcmp(s, "RUNNING") == 0 ) return 1; - if ( strcmp(s, "SUSPENDED") == 0 ) return 1; - if ( strcmp(s, "COMPLETING") == 0 ) return 1; - return 0; -} - - static char *g_bytes_to_terminated_array(GBytes *bytes) { gpointer arr; @@ -132,8 +122,7 @@ static char *g_bytes_to_terminated_array(GBytes *bytes) } -static int get_job_status(int job_id, int *running, - int *n_running, int *n_complete) +static int get_job_status(int job_id, int *n_alive, int *n_running) { const gchar *args[6]; GError *error = NULL; @@ -141,7 +130,6 @@ static int get_job_status(int job_id, int *running, char job_id_str[64]; char *line; char *nl; - int array_task; GBytes *stdout_buf; GBytes *stderr_buf; char *buf; @@ -180,53 +168,45 @@ static int get_job_status(int job_id, int *running, } free(buf_stderr); - if ( strstr(buf, "ArrayTaskId") != NULL ) { - array_task = 1; - *running = 0; - } else { - array_task = 0; - } - + *n_alive = 0; *n_running = 0; - *n_complete = 0; /* Parse output */ line = &buf[0]; nl = strchr(line, '\n'); while ( nl != NULL ) { - nl[0] = '\0'; - - if ( array_task ) { + int p1, p2; - const char *state = get_str_val(line, "JobState"); - const char *array_task_str = get_str_val(line, "ArrayTaskId"); + nl[0] = '\0'; - /* Ignore array job 'leader' */ - if ( strchr(array_task_str, '-') == NULL ) { + const char *state = get_str_val(line, "JobState"); + const char *array_task_str = get_str_val(line, "ArrayTaskId"); - if ( job_alive(state) ) { - (*n_running)++; - *running = 1; - } + if ((strcmp(state, "PENDING") == 0) + || (strcmp(state, "SUSPENDED") == 0)) + { + (*n_alive)++; + } - if ( strcmp(state, "COMPLETED") == 0 ) { - (*n_complete)++; - } + if ((strcmp(state, "RUNNING") == 0) + || (strcmp(state, "COMPLETING") == 0)) + { + (*n_running)++; + } - } else { - if ( job_alive(state) ) { - *running = 1; - } + if ( (array_task_str != NULL) + && (sscanf(array_task_str, "%i-%i", &p1, &p2) == 2) ) + { + /* This is a "job array leader" */ + if ((strcmp(state, "PENDING") == 0) + || (strcmp(state, "SUSPENDED") == 0)) { + (*n_alive) += p2-p1; } - - } else { - - const char *state = get_str_val(line, "JobState"); - *running = job_alive(state); - } + /* We are not interested in: FAILED, COMPLETED, CANCELLED */ + line = nl+1; nl = strchr(line, '\n'); } @@ -237,15 +217,13 @@ static int get_job_status(int job_id, int *running, } -static double indexing_progress(struct slurm_job *job, int *running, - int n_running, int n_complete) +static double indexing_progress(struct slurm_job *job, int n_alive, int n_running) { /* If there are lots of blocks, just count running jobs instead of * reading loads of log files */ if ( job->n_blocks > 15 ) { - return 0.1*(double)(n_running+n_complete) / job->n_blocks - + 0.9*(double)n_complete / job->n_blocks; + return (job->n_blocks - n_alive - 0.5*n_running) / job->n_blocks; } else { @@ -275,9 +253,9 @@ static int get_task_status(void *job_priv, float *frac_complete) { struct slurm_job *job = job_priv; - int n_running, n_complete; + int n_running, n_alive; - if ( get_job_status(job->job_id, running, &n_running, &n_complete) ) { + if ( get_job_status(job->job_id, &n_alive, &n_running) ) { ERROR("Failed to get task status: %i\n", job->job_id); return 1; } @@ -285,8 +263,8 @@ static int get_task_status(void *job_priv, switch ( job->type ) { case GUI_JOB_INDEXING : - *frac_complete = indexing_progress(job, running, - n_running, n_complete); + *frac_complete = indexing_progress(job, n_alive, n_running); + *running = (n_alive+n_running > 0); break; case GUI_JOB_AMBIGATOR : -- cgit v1.2.3