/*
* Copyright (c) 2001-2004 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 1998-2001 University of Notre Dame.
* All rights reserved.
* Copyright (c) 1994-1998 The Ohio State University.
* All rights reserved.
*
* This file is part of the LAM/MPI software package. For license
* information, see the LICENSE file in the top level directory of the
* LAM/MPI source distribution.
*
* $HEADER$
*
* $Id: hboot.c,v 6.35 2004/01/02 00:20:46 jsquyres Exp $
*
* Function: - boots OTB operating system
*/
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <syslog.h>
#include <lam_config.h>
#include <all_list.h>
#include <args.h>
#include <all_opt.h>
#include <boot.h>
#include <lamdebug.h>
#include <kio.h>
#include <portable.h>
#include <proc_schema.h>
#include <sfh.h>
#include <terror.h>
#include <etc_misc.h>
/*
 * MAXCLOSEFD is the exclusive upper bound of the descriptor range
 * (starting at 3) that main() marks close-on-exec when -s is given.
 */
#define MAXCLOSEFD 32 /* max close-on-exec fds. */
/* NOTE(review): LAM_MAXPATHLEN is not referenced in the visible part of this file. */
#define LAM_MAXPATHLEN 255 /* max path name len */
/*
 * external functions
 *
 * _lam_few() is handed the tkill argv by main(); a non-zero return is
 * treated as failure there.  The psc_*() entries are old-style
 * (unprototyped) declarations.
 * NOTE(review): no psc_*() call is visible in this file -- confirm
 * whether these declarations are still needed.
 */
extern int _lam_few(char**);
extern int psc_parse();
extern struct psc *psc_find();
extern struct psc *psc_fill();
/*
 * local functions
 */
static void setdefaults(void);
/*
 * local variables
 */
static char *t_tkill; /* tkill tool; set to DEFTRESETH by setdefaults() */
/*
 * Holds the "TROLLIUSRTF=<n>" string that main() passes to putenv().
 * It has static storage because putenv() keeps the pointer it is given.
 */
static char rtfbuf[32]; /* RTF env. var. */
static int fl_debug; /* debugging option (-d) */
static int fl_verbose; /* verbose option (-v) */
static OPT *ad = NULL; /* option descriptor returned by ao_init() */
/*
 *	main
 *
 *	Function:	- parses the command line and the process schema
 *			- optionally prepends "<prefix>/bin/" to PATH
 *			- optionally runs the tkill tool (-t)
 *			- forks and execs every program in the parsed
 *			  process schema list
 *	Accepts:	- argc, argv: command line
 *	Returns:	- 0 after all programs have been forked
 *			- LAMERROR if a malloc() fails
 *			- every other failure terminates through exit()
 *			  with EACCES, EUSAGE, 1 or the errno of the
 *			  failing call
 */
int
main(int argc, char *argv[])
{
  LIST *list_psc;               /* parsed process schema list */
  struct psc *p;
  int i, n;
  int fd;                       /* file descriptor for /dev/null */
  int pid;                      /* child PID */
  int ac_cmd;                   /* # command arguments */
  char **av_cmd;                /* command arguments */
  char buf[32];                 /* formatting buffer */
  char *tail;                   /* tail of full pathname */
  char *inet_topo;
  char *rtr_topo;
  int ret;                      /* errno saved across show_help()/perror() */
  char *debug_loc;
  char *prefix = NULL;
  char *new_path = NULL;
  char *opt_prefix;
  char *tkill_path;
  char *path_env;
  int prefix_len;
  int fl_prefix;
  int tkill_len;
  int debug;
  int len;
  size_t topo_len;
  lam_debug_stream_info_t debug_lds;

  /* Ensure that we are not root */
#ifndef LAM_ALLOW_RUN_AS_ROOT
  if (getuid() == 0 || geteuid() == 0) {
    show_help(NULL, "deny-root", NULL);
    exit(EACCES);
  }
#endif

  /*
   * Initialize option parser.
   */
  ad = ao_init();
  if (ad == 0) {
    ret = errno;
    perror("hboot (ao_init)");
    exit(ret);
  }
  ao_setflags(ad, AOPRESERVE_ARGV);
  ao_setopt1(ad, "dhstvNVn", 0, 0, 0);
  ao_setopt(ad, "c", 0, 1, 0);
  ao_setopt(ad, "I", 0, 1, 0);
  ao_setopt(ad, "R", 0, 1, 0);
  ao_setopt(ad, "sessionprefix", 0, 1, 0);
  ao_setopt(ad, "sessionsuffix", 0, 1, 0);
  ao_setopt(ad, "H", 0, 1, 0);
  ao_setopt(ad, "debug-output", 0, 1, 0);
  ao_setopt(ad, "prefix", 0, 1, 0);

  if (ao_parse(ad, &argc, argv)) {
    show_help("hboot", "usage", NULL);
    exit(EUSAGE);
  }
  if (ao_taken(ad, "h")) {
    show_help("hboot", "usage", NULL);
    exit(0);
  }

  /* set some generic defaults */
  setdefaults();

  /*
   * Select the debug stream: stdout unless -debug-output says
   * "syslog"; any other -debug-output value is a usage error.
   */
  debug_lds.lds_fl_debug = 0;
  debug_lds.lds_fl_syslog = 0;
  debug_lds.lds_fl_stdout = 0;
  debug_lds.lds_fl_stderr = 0;
  debug_lds.lds_fl_file = 0;
  debug_lds.lds_prefix = "hboot: ";
  if (ao_taken(ad, "debug-output")) {
    debug_loc = ao_param(ad, "debug-output", 0, 0);
    if (!strcmp(debug_loc, "stdout")) {
      debug_lds.lds_fl_stdout = 1;
    } else if (!strcmp(debug_loc, "syslog")) {
      debug_lds.lds_fl_syslog = 1;
      debug_lds.lds_syslog_priority = LOG_INFO;
      debug_lds.lds_syslog_ident = argv[0];
    } else {
      show_help("hboot", "usage", NULL);
      exit(EUSAGE);
    }
  } else {
    debug_lds.lds_fl_stdout = 1;
  }
  debug = lam_debug_open(&debug_lds);
  /* turn-off at init appears "not good" */
  lam_debug_switch(debug, fl_debug || fl_verbose);

  if (ao_taken(ad, "I")) {
    inet_topo = ao_param(ad, "I", 0, 0);
    /*
     * Remove quotes if enclosed in quotes.  The trailing character
     * is dropped only when it really is a quote, so an unbalanced
     * leading quote no longer eats the last character of the value.
     */
    if (inet_topo[0] == '"') {
      topo_len = strlen(inet_topo);
      if (inet_topo[topo_len - 1] == '"') {
        inet_topo[topo_len - 1] = '\0';
      }
      inet_topo = &inet_topo[1];
    }
  } else {
    inet_topo = NULL;
  }

  if (ao_taken(ad, "R")) {
    rtr_topo = ao_param(ad, "R", 0, 0);
  } else {
    rtr_topo = NULL;
  }

  /* get list of things we should do */
  if (hbootparse(debug, ad, inet_topo, rtr_topo, &list_psc)) {
    show_help("hboot", "cant-parse-config", NULL);
    exit(1);
  }

  /* get the path prefix if provided */
  fl_prefix = ao_taken(ad, "prefix");
  if (fl_prefix) {
    opt_prefix = ao_param(ad, "prefix", 0, 0);
    prefix_len = strlen(opt_prefix) + strlen("/bin/") + 1;
    prefix = malloc(prefix_len);
    if (prefix == NULL) {
      show_help(NULL, "lib-call-fail", "malloc", NULL);
      return LAMERROR;
    }
    snprintf(prefix, prefix_len, "%s/bin/", opt_prefix);

    /* Prepend the prefix to the existing env path, so that lamd
       can call the correct tkill during the lamhalt, by
       looking at the path */
    len = strlen(prefix) + 16;
    path_env = getenv("PATH");
    if (path_env != NULL) {
      len += strlen(path_env);
    }
    new_path = malloc(len);
    if (new_path == NULL) {
      show_help(NULL, "lib-call-fail", "malloc", NULL);
      return LAMERROR;
    }
    if (path_env != NULL) {
      snprintf(new_path, len, "PATH=%s:%s", prefix, path_env);
    } else {
      snprintf(new_path, len, "PATH=%s", prefix);
    }

    /*
     * putenv() reports failure with any non-zero value and keeps the
     * pointer it is given, so new_path must never be freed.
     */
    if (putenv(new_path) != 0) {
      ret = errno;              /* show_help() may change errno */
      show_help(NULL, "lib-call-fail", "putenv", NULL);
      exit(ret);
    }
  }

  /*
   * Bail out here, if pretending.
   */
  if (ao_taken(ad, "N")) {
    if (fl_verbose)
      printf("Fake hboot -- quitting\n");
    exit(0);
  }

  /*
   * Tkill if needed.
   */
  if (ao_taken(ad, "t")) {
    DBUG("hboot: performing %s\n", t_tkill);
    ac_cmd = 0;
    av_cmd = 0;

    if (fl_prefix) {
      tkill_len = strlen(prefix) + strlen(t_tkill) + 1;
      tkill_path = malloc(tkill_len);
      if (tkill_path == NULL) {
        show_help(NULL, "lib-call-fail", "malloc", NULL);
        return LAMERROR;
      }
      snprintf(tkill_path, tkill_len, "%s%s", prefix, t_tkill);
      /* NOTE(review): freeing right after the add presumes that
         sfh_argv_add() stores its own copy -- confirm */
      sfh_argv_add(&ac_cmd, &av_cmd, tkill_path);
      free(tkill_path);
    } else {
      sfh_argv_add(&ac_cmd, &av_cmd, t_tkill);
    }

    if (ao_taken(ad, "sessionprefix")) {
      sfh_argv_add(&ac_cmd, &av_cmd, "-sessionprefix");
      sfh_argv_add(&ac_cmd, &av_cmd,
                   ao_param(ad, "sessionprefix", 0, 0));
    }
    if (ao_taken(ad, "sessionsuffix")) {
      sfh_argv_add(&ac_cmd, &av_cmd, "-sessionsuffix");
      sfh_argv_add(&ac_cmd, &av_cmd,
                   ao_param(ad, "sessionsuffix", 0, 0));
    }

    if (fl_debug) {
      sfh_argv_add(&ac_cmd, &av_cmd, "-d");
      printf("hboot: ");
      for (i = 0; i < ac_cmd; i++)
        printf("%s ", av_cmd[i]);
      printf("\n");
    }

    if (_lam_few(av_cmd)) {
      ret = errno;              /* show_help() may change errno */
      show_help("hboot", "tkill-fail", NULL);
      exit(ret);
    }
  }

  /* The prefix is not needed past this point; free(NULL) is a no-op. */
  free(prefix);
  prefix = NULL;

  /*
   * Boot.
   */
  DBUG("hboot: booting...\n");

  /* rtfbuf is static: putenv() keeps the pointer it is given */
  snprintf(rtfbuf, sizeof(rtfbuf), "TROLLIUSRTF=%d", RTF_SYSGEN);
  if (putenv(rtfbuf) != 0) {
    ret = errno;
    show_help(NULL, "lib-call-fail", "putenv", NULL);
    exit(ret);
  }

  /* Take us out of the parent's group so that we don't get
     killed.  NOTE: this does *not* hurt us in environment such
     as TM and SLURM because hboot is not used in these
     environments. */
  setsid();

  if (ao_taken(ad, "s")) {
    /*
     * Make any extraneous file descriptors close-on-exec.
     * Descriptors that are not open (EBADF) are skipped.
     */
    for (i = 3; i < MAXCLOSEFD; ++i) {
      if ((fcntl(i, F_SETFD, FD_CLOEXEC) != 0) && (errno != EBADF)) {
        ret = errno;
        show_help(NULL, "system-call-fail",
                  "fcntl(set close-on-exec)", NULL);
        exit(ret);
      }
    }
  }

  n = 0;

  /*
   * Loop through all the programs in the parsed config file.
   */
  for (p = al_top(list_psc); p; p = al_next(list_psc, p)) {
    DBUG("hboot: fork %s\n", p->psc_argv[0]);

    if ((pid = fork()) < 0) {
      ret = errno;
      show_help(NULL, "system-call-fail", "fork", NULL);
      exit(ret);
    }
    else if (pid == 0) {        /* child */
      /* Put this setsid() here mainly for SGE --
         their tight integration with LAM/MPI does
         something like this:

         lamboot -> qrsh (to a remote node) -> hboot
         -> qrsh -> lamd

         Without having a setsid() here, there is a
         race condition between when hboot quits and
         SGE thinks the job is over (and therefore
         starts killing things) and when the second
         qrsh is able to establish itself and/or the
         lamd and tell SGE that the job is, in fact,
         *not* over.  So putting a setsid() here in
         the child, then the hboot child (and
         therefore the vulnerable period of the 2nd
         qrsh) escape being killed by SGE while
         still making progress on the overall
         lamboot.
       */
      setsid();

      if (ao_taken(ad, "s")) {
        /*
         * Safely get rid of the stdio descriptors.
         */
        if ((fd = open("/dev/null", O_RDWR)) < 0) {
          ret = errno;
          show_help(NULL, "system-call-fail",
                    "open(\"/dev/null\", O_RDWR)",
                    NULL);
          exit(ret);
        }
        if ((dup2(fd, 0) < 0) || (dup2(fd, 1) < 0) ||
            (dup2(fd, 2) < 0)) {
          ret = errno;
          show_help(NULL, "system-call-fail", "dup2",
                    NULL);
          exit(ret);
        }
        /*
         * If a stdio descriptor was closed on entry, open() may have
         * returned 0, 1 or 2; closing it then would undo the
         * redirection that was just set up.
         */
        if (fd > 2) {
          close(fd);
        }
      }

      if (fl_debug) {
        printf("hboot: attempting to execute \n");
      }

      /* execvp() only returns on failure */
      execvp(p->psc_argv[0], p->psc_argv);
      exit(errno);
    }
    else {                      /* parent */
      n++;

      if (fl_debug) {
        /* print "[n] pid program args..." using the base name */
        tail = strrchr(p->psc_argv[0], STRDIR);
        tail = (tail) ? tail + 1 : p->psc_argv[0];
        snprintf(buf, sizeof(buf), "[%d]", n);
        printf("%-4.4s %5d %s", buf, pid, tail);
        for (i = 1; i < p->psc_argc; i++) {
          printf(" %s", p->psc_argv[i]);
        }
        printf("\n");
      }
    }

    /* honor the per-program delay from the schema entry */
    if (p->psc_delay > 0) {
      sleep((unsigned int) p->psc_delay);
    }
  }

  al_free(list_psc);
  return(0);
}
/*
 *	setdefaults
 *
 *	Function:	- selects the default tkill tool
 *			- caches the -d and -v option flags from the
 *			  option descriptor
 *
 *	main() calls this after the command line has been parsed.
 */
static void
setdefaults(void)
{
  /* tool that main() runs when -t is given */
  t_tkill = DEFTRESETH;

  /* option flags */
  fl_debug = ao_taken(ad, "d");
  fl_verbose = ao_taken(ad, "v");
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */