"""
CIME restart upon failed node test.
"""
from CIME.XML.standard_module_setup import *
from CIME.SystemTests.ers import ERS
from CIME.utils import get_model
logger = logging.getLogger(__name__)
[docs]
class NODEFAIL(ERS):
def __init__(self, case, **kwargs):
"""
initialize an object interface to the ERS system test
"""
ERS.__init__(self, case, **kwargs)
self._fail_sentinel = os.path.join(case.get_value("RUNDIR"), "FAIL_SENTINEL")
self._fail_str = case.get_value("NODE_FAIL_REGEX")
def _restart_fake_phase(self):
# Swap out model.exe for one that emits node failures
rundir = self._case.get_value("RUNDIR")
exeroot = self._case.get_value("EXEROOT")
driver = self._case.get_value("COMP_INTERFACE")
if driver == "nuopc":
logname = "med"
else:
logname = "cpl"
fake_exe = """#!/bin/bash
fail_sentinel={0}
cpl_log={1}/{4}.log.$LID
model_log={1}/{2}.log.$LID
touch $cpl_log
touch $fail_sentinel
declare -i num_fails=$(cat $fail_sentinel | wc -l)
declare -i times_to_fail=${{NODEFAIL_NUM_FAILS:-3}}
if ((num_fails < times_to_fail)); then
echo FAKE FAIL >> $cpl_log
echo FAIL >> $fail_sentinel
echo '{3}' >> $model_log
sleep 1
exit -1
else
echo Insta pass
echo SUCCESSFUL TERMINATION > $cpl_log
fi
""".format(
self._fail_sentinel, rundir, get_model(), self._fail_str, logname
)
fake_exe_file = os.path.join(exeroot, "fake.sh")
with open(fake_exe_file, "w") as fd:
fd.write(fake_exe)
os.chmod(fake_exe_file, 0o755)
prev_run_exe = self._case.get_value("run_exe")
env_mach_specific = self._case.get_env("mach_specific")
env_mach_specific.set_value("run_exe", fake_exe_file)
self._case.flush(flushall=True)
# This flag is needed by mpt to run a script under mpiexec
mpilib = self._case.get_value("MPILIB")
if mpilib == "mpt":
os.environ["MPI_SHEPHERD"] = "true"
self.run_indv(suffix=None)
if mpilib == "mpt":
del os.environ["MPI_SHEPHERD"]
env_mach_specific = self._case.get_env("mach_specific")
env_mach_specific.set_value("run_exe", prev_run_exe)
self._case.flush(flushall=True)
[docs]
def run_phase(self):
self._ers_first_phase()
self._restart_fake_phase()
self._ers_second_phase()