/*
 * Copyright (c) 2001-2003 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: lamboot.c,v 6.72 2004/03/06 19:59:29 jsquyres Exp $
 *
 *	Function:	- fully-connected LAM booting tool
 *			- boots Trollius on a network of UNIX hosts
 *			- hosts specified using host file syntax
 *			- uses hboot
 */

#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>

#include <all_opt.h>
#include <args.h>
#include <kio.h>
#include <mpisys.h>
#include <lamnet.h>
#include <laminternal.h>
#include <etc_misc.h>
#include <lam-ssi.h>
#include <lam-ssi-boot.h>
#include <sfh.h>


/*
 * local variables
 */
/* Count written by lsba_start_rte_procs() in main(); wipe() does
   nothing while it is still <= 0 */
static int num_started = -1;
/* Non-zero when -v was given on the command line */
static int fl_verbose = 0;
/* Non-zero when -d was given on the command line */
static int fl_debug = 0;
/* Copy of main()'s argc and a duplicate of its argv, kept so that
   wipe() can hand the same arguments to the wipe tool */
static int main_argc = 0;
static char **main_argv = NULL;
/* Option descriptor returned by ao_init() */
static OPT *ad = NULL;


/* 
 * local functions
 */
/* Cleans up and exits; installed as the SIGINT handler and also
   called directly by main() when booting fails */
static void bail(int);
/* Runs the wipe tool (DEFTWIPE) with lamboot's own arguments */
static void wipe(void);


/*
 *	run_prefix_lamboot
 *
 *	Function:	- runs <lamprefix>/bin/lamboot with the caller's
 *			  arguments plus "-withlamprefixpath"
 *	Accepts:	- argument count
 *			- argument vector
 *			- installation prefix given with -prefix
 *	Returns:	- return value of _lam_few(), or LAMERROR if the
 *			  command could not be built or is not executable
 */
static int
run_prefix_lamboot(int argc, char **argv, char *lamprefix)
{
  int status;
  int cmdc = argc;
  char **cmdv;
  char *path;
  size_t path_len;

  /* Build the path of the lamboot that lives under the prefix */

  path_len = strlen("/bin/lamboot") + strlen(lamprefix) + 2;
  path = malloc(path_len);
  if (path == NULL) {
    show_help(NULL, "system-call-fail", "malloc", NULL);
    return LAMERROR;
  }
  snprintf(path, path_len, "%s%s", lamprefix, "/bin/lamboot");

  /* Check to see if the path is valid and we have permission to run it.
     NOTE(review): every other show_help() call in this file ends its
     argument list with NULL, so one is supplied here as well --
     confirm against show_help()'s definition. */

  if (access(path, X_OK) == -1) {
    show_help("boot", "invalid-path-or-no-permission", lamprefix, "lamboot",
              NULL);
    free(path);
    return LAMERROR;
  }

  /* Same arguments as ours, but with argv[0] replaced by the new
     lamboot path and -withlamprefixpath appended so that the child
     does not try to relaunch itself again */

  cmdv = sfh_argv_dup(argv);
  if (cmdv == NULL) {
    show_help(NULL, "system-call-fail", "malloc", NULL);
    free(path);
    return LAMERROR;
  }
  free(cmdv[0]);
  cmdv[0] = path;
  sfh_argv_add(&cmdc, &cmdv, "-withlamprefixpath");

  /* Run the prefix lamboot and hand its result back to the caller
     rather than unconditionally reporting success.
     NOTE(review): assumes _lam_few() returns 0 on success and
     non-zero otherwise -- confirm against its definition. */

  status = _lam_few(cmdv);
  sfh_argv_free(cmdv);
  return status;
}


/*
 *	main
 *
 *	Function:	- parses the command line, then drives the boot SSI
 *			  module through node allocation, verification and
 *			  startup
 *	Accepts:	- argument count
 *			- argument vector
 *	Returns:	- 0 on success, non-zero otherwise
 */
int
main(int argc, char *argv[])
{
  int i, ret = 0;
  int iorigin;
  int nlamnet;
  struct lamnode *lamnet;
  int unusedc;
  char **unusedv;
  char *prefix = NULL;
  char *suffix = NULL;

  /* Keep a private copy of the arguments for wipe() */

  main_argc = argc;
  main_argv = sfh_argv_dup(argv);

  /* Parse the command line */

  ad = ao_init();
  if (ad == NULL) {
    ret = errno;
    perror("lamboot (ao_init)");
    exit(ret);
  }
  ao_setflags(ad, AOPRESERVE_ARGV);
  ao_setopt1(ad, "bdhsvxHVl", 0, 0, 0);
  ao_setopt(ad, "nn", 0, 0, 0);
  ao_setopt(ad, "np", 0, 0, 0);
  ao_setopt(ad, "c", 0, 1, 0);
  ao_setopt(ad, "sessionprefix", 0, 1, 0);
  ao_setopt(ad, "sessionsuffix", 0, 1, 0);
  ao_setopt(ad, "prefix", 0, 1, 0);
  ao_setopt(ad, "withlamprefixpath", 0, 0, 0);

  lam_ssi_base_open(ad);
  lam_ssi_base_ao_setup(ad);

  if (ao_parse(ad, &argc, argv)) {
    show_help("lamboot", "usage", NULL);
    lam_ssi_base_close();
    exit(EUSAGE);
  }

  /* Check for -d and -v *before* we initialize the boot SSI */

  fl_verbose = ao_taken(ad, "v");
  fl_debug = ao_taken(ad, "d");

  if (fl_verbose)
    lam_ssi_base_param_set_string("boot_verbose", "level:0,stderr");
  if (fl_debug)
    lam_ssi_base_param_set_string("boot_verbose", "level:1000,stderr");
  if (ao_taken(ad, "b"))
    lam_ssi_base_param_set_string("boot_rsh_fast", "1");

  /* Check for -withlamprefixpath.  If it was not supplied but -prefix
     was, hand the whole job over to the lamboot under that prefix. */

  if (!ao_taken(ad, "withlamprefixpath") && ao_taken(ad, "prefix")) {
    ret = run_prefix_lamboot(argc, argv, ao_param(ad, "prefix", 0, 0));
    lam_ssi_base_close();
    return ret;
  }

  /* Compatibility arguments, from pre-SSI days (and it's just a
     heckuva lot shorter to type!) */

  if (ao_taken(ad, "nn"))
    lam_ssi_base_param_set_int("rsh_base_no_n", 1);
  if (ao_taken(ad, "np"))
    lam_ssi_base_param_set_int("rsh_base_no_profile", 1);

  /* Batch system gorp */

  if (ao_taken(ad, "sessionprefix")) {
    prefix = ao_param(ad, "sessionprefix", 0, 0);
  } else {
    prefix = NULL;
  }

  if (ao_taken(ad, "sessionsuffix")) {
    suffix = ao_param(ad, "sessionsuffix", 0, 0);
  } else {
    suffix = NULL;
  }
  lam_tmpdir_init_opt(prefix, suffix, 0);

  /* Let SSI parse all base SSI-related arguments.  It will print out
     its own error message if necessary, so exiting upon failure is
     sufficient here.  errno is saved before lam_ssi_base_close() can
     overwrite it, and a failure never exits with status 0. */

  if (lam_ssi_base_ao_process_args(ad) != 0) {
    ret = errno;
    lam_ssi_base_close();
    exit(ret != 0 ? ret : 1);
  }

  /* Just a version output? */

  if (ao_taken(ad, "V")) {
    lam_show_version(1);

    /* For backwards compatibility only -- use laminfo for more detail */

    printf("\tSSI rpi:\t%s\n", RPI_MODULES);
    ao_free(ad);
    if (main_argv != NULL)
      sfh_argv_free(main_argv);
    lam_ssi_base_close();
    exit(0);
  }

  /* Ensure that we are not root */

#ifndef LAM_ALLOW_RUN_AS_ROOT
  if (getuid() == 0 || geteuid() == 0) {
    show_help(NULL, "deny-root", NULL);
    lam_ssi_base_close();
    exit(EACCES);
  }
#endif

  /* Too many leftover arguments is a usage error.  errno is still
     reset to 0 in the good case, as before, but the exit status no
     longer depends on errno surviving the calls made in between. */

  ao_unused(ad, &unusedc, &unusedv);
  errno = (unusedc <= 2) ? 0 : EUSAGE;
  if (errno != 0) {
    show_help("lamboot", "usage", NULL);
    lam_ssi_base_close();
    exit(EUSAGE);
  }

  if (ao_taken(ad, "h")) {
    show_help("lamboot", "usage", NULL);
    ao_free(ad);
    if (main_argv != NULL)
      sfh_argv_free(main_argv);
    lam_ssi_base_close();
    exit(0);
  }

  /* Open the boot SSI */

  if (lam_ssi_boot_open(ad) != 0 ||
      lam_ssi_boot_select(LAM_SSI_BOOT_LOCATION_ROOT) != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  if (!ao_taken(ad, "H"))
    lam_show_version(0);

  /* Let the boot SSI parse the command line.  Relevant errors will be
     displayed before returning from this function. */

  if (lam_ssi_boot.lsba_parse_options(ad, 1) != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  /* Let the boot SSI generate a list of nodes.  Relevant errors will
     be displayed before returning from this function. */

  if (lam_ssi_boot.lsba_allocate_nodes(&lamnet, &nlamnet, &iorigin) != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  /* Let the boot SSI do an error check on the nodes.  Relevant errors
     will be displayed before returning from this function. */

  if (lam_ssi_boot.lsba_verify_nodes(lamnet, nlamnet) != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  /* Setup the _kio struct */

  _kio.ki_origin = iorigin;

  /* Let the boot SSI prepare to boot.  Relevant errors will be
     displayed before returning from this function. */

  if (lam_ssi_boot.lsba_prepare_boot() != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  /* Clean up on interrupt */

  if (signal(SIGINT, bail) == SIG_ERR) {
    show_help(NULL, "system-call-fail", "signal", NULL);
    lam_ssi_base_close();
    exit(1);
  }

  /* Now actually have the boot SSI boot the nodes.  Relevant errors
     will be displayed before returning from this function. */

  if (lam_ssi_boot.lsba_start_rte_procs(lamnet, nlamnet,
                                        LAM_SSI_BOOT_PROC_LAMD,
                                        &num_started) != 0 ||
      num_started != nlamnet)
    bail(-1);

  /* Now that we've started everyone correctly, check to see if there
     are any schedulable nodes */

  for (i = 0; i < nlamnet; ++i)
    if ((lamnet[i].lnd_type & NT_WASTE) == 0)
      break;
  if (i >= nlamnet)
    show_help("boot", "no-schedulable-nodes", NULL);

  /* Free up resources */

  if (lam_ssi_boot.lsba_deallocate_nodes(&lamnet, &nlamnet) != 0) {
    lam_ssi_base_close();
    exit(1);
  }

  /* All done with the boot SSI module; a failure here only changes
     the exit status */

  if (lam_ssi_boot_close() != 0)
    ret = 1;

  /* All done */

  if (main_argv != NULL)
    sfh_argv_free(main_argv);

  ao_free(ad);
  lam_ssi_base_close();
  return ret;
}


/*
 *	bail
 *
 *	Function:	- runs wipe(), closes the SSI base and exits
 *			- installed by main() as the SIGINT handler and
 *			  also called directly when booting fails
 *	Accepts:	- signal number (main() passes -1)
 *	Returns:	- does not return; exits with the errno seen on
 *			  entry, or with the signal number if errno was 0
 */
static void
bail(int sig)
{
  /* Capture errno first: the cleanup calls below may overwrite it */
  const int saved_errno = errno;
  int status;

  /* NOTE(review): when entered as a signal handler this still calls
     fprintf() and exit(), which POSIX does not list as
     async-signal-safe */

  wipe();
  if (fl_debug || fl_verbose)
    fprintf(stderr, "lamboot did NOT complete successfully\n");
  lam_ssi_base_close();

  /* We want a non-zero exit status, so fall back to the signal
     number when no error code was pending */
  status = (saved_errno != 0) ? saved_errno : sig;
  exit(status);
}


/*
 *	wipe
 *
 *	Function:	- executes the wipe tool (DEFTWIPE) with the same
 *			  arguments lamboot was given, plus -H
 *			- kills all host Trollius sessions
 *			- does nothing if num_started is <= 0 or if the
 *			  saved argument vector is unavailable
 */
static void
wipe(void)
{
  int i;
  int cmdn = 0;
  char **cmdv = NULL;

  if (num_started <= 0)
    return;

  /* main() treats main_argv as possibly NULL (it checks before
     freeing it).  Without the saved arguments we cannot forward them
     to the wipe tool, so skip it rather than dereference NULL. */

  if (main_argv == NULL)
    return;

  show_help("boot", "about-to-lamwipe", "lamboot", NULL);
  sfh_argv_add(&cmdn, &cmdv, DEFTWIPE);
  for (i = 1; i < main_argc; ++i)
    sfh_argv_add(&cmdn, &cmdv, main_argv[i]);
  sfh_argv_add(&cmdn, &cmdv, "-H");

  /* This is a last ditch effort, so if _lam_few fails, there's really
     nothing we can do about it.  So just ignore the return value from
     _lam_few().  Only call it if a command vector was actually built. */

  if (cmdv != NULL)
    _lam_few(cmdv);
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */