diff options
author | Thomas White <taw@physics.org> | 2024-09-24 13:59:29 +0200 |
---|---|---|
committer | Thomas White <taw@physics.org> | 2024-09-24 13:59:29 +0200 |
commit | b568f3d4cc06fe96e072b988ed776d986fec4a1a (patch) | |
tree | 5a5c0a0e6b9a0032b7fcf3e83140585c524982da | |
parent | 0bd7578915f658892122e72ed2c84034e10a10a7 (diff) |
GUI: Improve progress bar for large Slurm jobs
Previously, the progress bar counted the number of "COMPLETED" sub-jobs.
This is unreliable, because sub-jobs might be taken out of the scontrol
output before the whole job is complete. Now, it counts the sub-jobs which
are still waiting (PENDING, SUSPENDED) or in progress (RUNNING, COMPLETING),
and subtracts them from the total number of tasks to get a reliable count,
with each in-progress sub-job counted as half done.
Fixes: https://gitlab.desy.de/thomas.white/crystfel/-/issues/102
-rw-r--r-- | src/gui_backend_slurm.c | 84 |
1 file changed, 31 insertions, 53 deletions
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c index c377858e..1c81dc55 100644 --- a/src/gui_backend_slurm.c +++ b/src/gui_backend_slurm.c @@ -102,16 +102,6 @@ static const char *get_str_val(const char *line, const char *key) } -static int job_alive(const char *s) -{ - if ( strcmp(s, "PENDING") == 0 ) return 1; - if ( strcmp(s, "RUNNING") == 0 ) return 1; - if ( strcmp(s, "SUSPENDED") == 0 ) return 1; - if ( strcmp(s, "COMPLETING") == 0 ) return 1; - return 0; -} - - static char *g_bytes_to_terminated_array(GBytes *bytes) { gpointer arr; @@ -132,8 +122,7 @@ static char *g_bytes_to_terminated_array(GBytes *bytes) } -static int get_job_status(int job_id, int *running, - int *n_running, int *n_complete) +static int get_job_status(int job_id, int *n_alive, int *n_running) { const gchar *args[6]; GError *error = NULL; @@ -141,7 +130,6 @@ static int get_job_status(int job_id, int *running, char job_id_str[64]; char *line; char *nl; - int array_task; GBytes *stdout_buf; GBytes *stderr_buf; char *buf; @@ -180,53 +168,45 @@ static int get_job_status(int job_id, int *running, } free(buf_stderr); - if ( strstr(buf, "ArrayTaskId") != NULL ) { - array_task = 1; - *running = 0; - } else { - array_task = 0; - } - + *n_alive = 0; *n_running = 0; - *n_complete = 0; /* Parse output */ line = &buf[0]; nl = strchr(line, '\n'); while ( nl != NULL ) { - nl[0] = '\0'; - - if ( array_task ) { + int p1, p2; - const char *state = get_str_val(line, "JobState"); - const char *array_task_str = get_str_val(line, "ArrayTaskId"); + nl[0] = '\0'; - /* Ignore array job 'leader' */ - if ( strchr(array_task_str, '-') == NULL ) { + const char *state = get_str_val(line, "JobState"); + const char *array_task_str = get_str_val(line, "ArrayTaskId"); - if ( job_alive(state) ) { - (*n_running)++; - *running = 1; - } + if ((strcmp(state, "PENDING") == 0) + || (strcmp(state, "SUSPENDED") == 0)) + { + (*n_alive)++; + } - if ( strcmp(state, "COMPLETED") == 0 ) { - (*n_complete)++; - } + if 
((strcmp(state, "RUNNING") == 0) + || (strcmp(state, "COMPLETING") == 0)) + { + (*n_running)++; + } - } else { - if ( job_alive(state) ) { - *running = 1; - } + if ( (array_task_str != NULL) + && (sscanf(array_task_str, "%i-%i", &p1, &p2) == 2) ) + { + /* This is a "job array leader" */ + if ((strcmp(state, "PENDING") == 0) + || (strcmp(state, "SUSPENDED") == 0)) { + (*n_alive) += p2-p1; } - - } else { - - const char *state = get_str_val(line, "JobState"); - *running = job_alive(state); - } + /* We are not interested in: FAILED, COMPLETED, CANCELLED */ + line = nl+1; nl = strchr(line, '\n'); } @@ -237,15 +217,13 @@ static int get_job_status(int job_id, int *running, } -static double indexing_progress(struct slurm_job *job, int *running, - int n_running, int n_complete) +static double indexing_progress(struct slurm_job *job, int n_alive, int n_running) { /* If there are lots of blocks, just count running jobs instead of * reading loads of log files */ if ( job->n_blocks > 15 ) { - return 0.1*(double)(n_running+n_complete) / job->n_blocks - + 0.9*(double)n_complete / job->n_blocks; + return (job->n_blocks - n_alive - 0.5*n_running) / job->n_blocks; } else { @@ -275,9 +253,9 @@ static int get_task_status(void *job_priv, float *frac_complete) { struct slurm_job *job = job_priv; - int n_running, n_complete; + int n_running, n_alive; - if ( get_job_status(job->job_id, running, &n_running, &n_complete) ) { + if ( get_job_status(job->job_id, &n_alive, &n_running) ) { ERROR("Failed to get task status: %i\n", job->job_id); return 1; } @@ -285,8 +263,8 @@ static int get_task_status(void *job_priv, switch ( job->type ) { case GUI_JOB_INDEXING : - *frac_complete = indexing_progress(job, running, - n_running, n_complete); + *frac_complete = indexing_progress(job, n_alive, n_running); + *running = (n_alive+n_running > 0); break; case GUI_JOB_AMBIGATOR : |