aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Thomas White <taw@physics.org>, 2024-09-24 13:59:29 +0200
committer: Thomas White <taw@physics.org>, 2024-09-24 13:59:29 +0200
commit: b568f3d4cc06fe96e072b988ed776d986fec4a1a (patch)
tree: 5a5c0a0e6b9a0032b7fcf3e83140585c524982da
parent: 0bd7578915f658892122e72ed2c84034e10a10a7 (diff)
GUI: Improve progress bar for large Slurm jobs
Previously, it counted the number of "COMPLETED" jobs. This is unreliable, because sub-jobs might be taken out of the scontrol output before the whole job is complete. Now, it counts the PENDING (and similar) states, and subtracts from the total number of tasks to get a reliable count. Fixes: https://gitlab.desy.de/thomas.white/crystfel/-/issues/102
-rw-r--r-- src/gui_backend_slurm.c 84
1 file changed, 31 insertions, 53 deletions
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c
index c377858e..1c81dc55 100644
--- a/src/gui_backend_slurm.c
+++ b/src/gui_backend_slurm.c
@@ -102,16 +102,6 @@ static const char *get_str_val(const char *line, const char *key)
}
-static int job_alive(const char *s)
-{
- if ( strcmp(s, "PENDING") == 0 ) return 1;
- if ( strcmp(s, "RUNNING") == 0 ) return 1;
- if ( strcmp(s, "SUSPENDED") == 0 ) return 1;
- if ( strcmp(s, "COMPLETING") == 0 ) return 1;
- return 0;
-}
-
-
static char *g_bytes_to_terminated_array(GBytes *bytes)
{
gpointer arr;
@@ -132,8 +122,7 @@ static char *g_bytes_to_terminated_array(GBytes *bytes)
}
-static int get_job_status(int job_id, int *running,
- int *n_running, int *n_complete)
+static int get_job_status(int job_id, int *n_alive, int *n_running)
{
const gchar *args[6];
GError *error = NULL;
@@ -141,7 +130,6 @@ static int get_job_status(int job_id, int *running,
char job_id_str[64];
char *line;
char *nl;
- int array_task;
GBytes *stdout_buf;
GBytes *stderr_buf;
char *buf;
@@ -180,53 +168,45 @@ static int get_job_status(int job_id, int *running,
}
free(buf_stderr);
- if ( strstr(buf, "ArrayTaskId") != NULL ) {
- array_task = 1;
- *running = 0;
- } else {
- array_task = 0;
- }
-
+ *n_alive = 0;
*n_running = 0;
- *n_complete = 0;
/* Parse output */
line = &buf[0];
nl = strchr(line, '\n');
while ( nl != NULL ) {
- nl[0] = '\0';
-
- if ( array_task ) {
+ int p1, p2;
- const char *state = get_str_val(line, "JobState");
- const char *array_task_str = get_str_val(line, "ArrayTaskId");
+ nl[0] = '\0';
- /* Ignore array job 'leader' */
- if ( strchr(array_task_str, '-') == NULL ) {
+ const char *state = get_str_val(line, "JobState");
+ const char *array_task_str = get_str_val(line, "ArrayTaskId");
- if ( job_alive(state) ) {
- (*n_running)++;
- *running = 1;
- }
+ if ((strcmp(state, "PENDING") == 0)
+ || (strcmp(state, "SUSPENDED") == 0))
+ {
+ (*n_alive)++;
+ }
- if ( strcmp(state, "COMPLETED") == 0 ) {
- (*n_complete)++;
- }
+ if ((strcmp(state, "RUNNING") == 0)
+ || (strcmp(state, "COMPLETING") == 0))
+ {
+ (*n_running)++;
+ }
- } else {
- if ( job_alive(state) ) {
- *running = 1;
- }
+ if ( (array_task_str != NULL)
+ && (sscanf(array_task_str, "%i-%i", &p1, &p2) == 2) )
+ {
+ /* This is a "job array leader" */
+ if ((strcmp(state, "PENDING") == 0)
+ || (strcmp(state, "SUSPENDED") == 0)) {
+ (*n_alive) += p2-p1;
}
-
- } else {
-
- const char *state = get_str_val(line, "JobState");
- *running = job_alive(state);
-
}
+ /* We are not interested in: FAILED, COMPLETED, CANCELLED */
+
line = nl+1;
nl = strchr(line, '\n');
}
@@ -237,15 +217,13 @@ static int get_job_status(int job_id, int *running,
}
-static double indexing_progress(struct slurm_job *job, int *running,
- int n_running, int n_complete)
+static double indexing_progress(struct slurm_job *job, int n_alive, int n_running)
{
/* If there are lots of blocks, just count running jobs instead of
* reading loads of log files */
if ( job->n_blocks > 15 ) {
- return 0.1*(double)(n_running+n_complete) / job->n_blocks
- + 0.9*(double)n_complete / job->n_blocks;
+ return (job->n_blocks - n_alive - 0.5*n_running) / job->n_blocks;
} else {
@@ -275,9 +253,9 @@ static int get_task_status(void *job_priv,
float *frac_complete)
{
struct slurm_job *job = job_priv;
- int n_running, n_complete;
+ int n_running, n_alive;
- if ( get_job_status(job->job_id, running, &n_running, &n_complete) ) {
+ if ( get_job_status(job->job_id, &n_alive, &n_running) ) {
ERROR("Failed to get task status: %i\n", job->job_id);
return 1;
}
@@ -285,8 +263,8 @@ static int get_task_status(void *job_priv,
switch ( job->type ) {
case GUI_JOB_INDEXING :
- *frac_complete = indexing_progress(job, running,
- n_running, n_complete);
+ *frac_complete = indexing_progress(job, n_alive, n_running);
+ *running = (n_alive+n_running > 0);
break;
case GUI_JOB_AMBIGATOR :