aboutsummaryrefslogtreecommitdiff
path: root/src/im-sandbox.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/im-sandbox.c')
-rw-r--r--src/im-sandbox.c36
1 files changed, 23 insertions, 13 deletions
diff --git a/src/im-sandbox.c b/src/im-sandbox.c
index 51cd5903..f5493453 100644
--- a/src/im-sandbox.c
+++ b/src/im-sandbox.c
@@ -3,13 +3,13 @@
*
* Sandbox for indexing
*
- * Copyright © 2012-2016 Deutsches Elektronen-Synchrotron DESY,
+ * Copyright © 2012-2017 Deutsches Elektronen-Synchrotron DESY,
* a research centre of the Helmholtz Association.
* Copyright © 2012 Richard Kirian
* Copyright © 2012 Lorenzo Galli
*
* Authors:
- * 2010-2016 Thomas White <taw@physics.org>
+ * 2010-2017 Thomas White <taw@physics.org>
* 2014 Valerio Mariani
* 2011 Richard Kirian
* 2012 Lorenzo Galli
@@ -73,10 +73,18 @@ struct sandbox
struct index_args *iargs;
+ /* Worker processes */
int n_proc;
pid_t *pids;
-
int *running;
+ time_t *last_response;
+ int last_ping[MAX_NUM_WORKERS];
+
+ /* Streams to read from (NB not the same indices as the above) */
+ int n_read;
+ FILE **fhs;
+ int *fds;
+
int serial;
struct sb_shm *shared;
@@ -84,14 +92,6 @@ struct sandbox
char *tmpdir;
- /* The last time each worker was heard from */
- time_t *last_response;
-
- /* Streams to read from */
- int n_read;
- FILE **fhs;
- int *fds;
-
/* Final output */
Stream *stream;
};
@@ -124,6 +124,7 @@ static time_t get_monotonic_seconds()
static void stamp_response(struct sandbox *sb, int n)
{
sb->last_response[n] = get_monotonic_seconds();
+ sb->last_ping[n] = sb->shared->pings[n];
}
@@ -131,14 +132,21 @@ static void check_hung_workers(struct sandbox *sb)
{
int i;
time_t tnow = get_monotonic_seconds();
- for ( i=0; i<sb->n_read; i++ ) {
+ for ( i=0; i<sb->n_proc; i++ ) {
+
if ( !sb->running[i] ) continue;
+
+ if ( sb->shared->pings[i] != sb->last_ping[i] ) {
+ stamp_response(sb, i);
+ }
+
if ( tnow - sb->last_response[i] > 240 ) {
STATUS("Worker %i did not respond for 240 seconds - "
"sending it SIGKILL.\n", i);
kill(sb->pids[i], SIGKILL);
stamp_response(sb, i);
}
+
}
}
@@ -516,7 +524,6 @@ static void try_read(struct sandbox *sb, TimeAccounts *taccs)
/* If the chunk cannot be read, assume the connection
* is broken and that the process will die soon. */
time_accounts_set(taccs, TACC_STREAMREAD);
- stamp_response(sb, i);
if ( pump_chunk(sb->fhs[i], ofd) ) {
remove_pipe(sb, i);
}
@@ -535,6 +542,9 @@ static void start_worker_process(struct sandbox *sb, int slot)
return;
}
+ sb->shared->pings[slot] = 0;
+ sb->last_ping[slot] = 0;
+
p = fork();
if ( p == -1 ) {
ERROR("fork() failed!\n");