Source code for CIME.hist_utils

"""
Functions for actions pertaining to history files.
"""

from CIME.XML.standard_module_setup import *
from CIME.test_status import TEST_NO_BASELINES_COMMENT

from CIME.utils import get_current_commit, get_timestamp, get_model, safe_copy

import logging, glob, os, re, stat, filecmp
logger = logging.getLogger(__name__)

BLESS_LOG_NAME = "bless_log"

def _iter_model_file_substrs(case):
    models = case.get_compset_components()
    models.append('cpl')
    for model in models:
        yield model

def _get_all_hist_files(testcase, model, from_dir, suffix=""):
    suffix = (".{}".format(suffix) if suffix else "")

    # Match hist files produced by run
    test_hists = glob.glob("{}/{}.{}*.h?.nc{}".format(from_dir, testcase, model, suffix))
    test_hists.extend(glob.glob("{}/{}.{}*.h.nc{}".format(from_dir, testcase, model, suffix)))

    # Match multi-instance files produced by run
    test_hists.extend(glob.glob("{}/{}.{}*.h?.*.nc{}".format(from_dir, testcase, model, suffix)))
    test_hists.extend(glob.glob("{}/{}.{}*.h.*.nc{}".format(from_dir, testcase, model, suffix)))

    # suffix == "" implies baseline comparison, baseline hist files have simpler names

    if suffix == "":
        test_hists.extend(glob.glob("{}/{}*.h.nc".format(from_dir, model)))
        test_hists.extend(glob.glob("{}/{}*.h?.nc".format(from_dir, model)))
        test_hists.extend(glob.glob("{}/{}*.h.*.nc".format(from_dir, model)))
        test_hists.extend(glob.glob("{}/{}*.h?.*.nc".format(from_dir, model)))

    test_hists.sort()
    return test_hists

def _get_latest_hist_files(testcase, model, from_dir, suffix=""):
    test_hists = _get_all_hist_files(testcase, model, from_dir, suffix)
    latest_files = {}
    histlist = []
    for hist in test_hists:
        ext = get_extension(model, hist)
        latest_files[ext] = hist

    for key in latest_files.keys():
        histlist.append(latest_files[key])
    return histlist

[docs]def copy(case, suffix):
    """Copy the most recent batch of hist files in a case, adding the given suffix.

    This can allow you to temporarily "save" these files so they won't be blown
    away if you re-run the case.

    case - The case containing the files you want to save
    suffix - The string suffix you want to add to saved files, this can be used to find them later.
    """
    rundir   = case.get_value("RUNDIR")
    testcase = case.get_value("CASE")

    # Loop over models
    comments = "Copying hist files to suffix '{}'\n".format(suffix)
    num_copied = 0
    for model in _iter_model_file_substrs(case):
        comments += "  Copying hist files for model '{}'\n".format(model)
        test_hists = _get_latest_hist_files(testcase, model, rundir)
        num_copied += len(test_hists)
        for test_hist in test_hists:
            new_file = "{}.{}".format(test_hist, suffix)
            if os.path.exists(new_file):
                os.remove(new_file)

            comments += "    Copying '{}' to '{}'\n".format(test_hist, new_file)

            # Need to copy rather than move in case there are some history files
            # that will need to continue to be filled on the next phase; this
            # can be the case for a restart run.
            #
            # (If it weren't for that possibility, a move/rename would be more
            # robust here: The problem with a copy is that there can be
            # confusion after the second run as to which files were created by
            # the first run and which by the second. For example, if the second
            # run fails to output any history files, the test will still pass,
            # because the test system will think that run1's files were output
            # by run2. But we live with that downside for the sake of the reason
            # noted above.)
            safe_copy(test_hist, new_file)

    expect(num_copied > 0, "copy failed: no hist files found in rundir '{}'".format(rundir))

    return comments

def _hists_match(model, hists1, hists2, suffix1="", suffix2=""):
    """
    return (num in set 1 but not 2 , num in set 2 but not 1, matchups)

    >>> hists1 = ['FOO.G.cpl.h1.nc', 'FOO.G.cpl.h2.nc', 'FOO.G.cpl.h3.nc']
    >>> hists2 = ['cpl.h2.nc', 'cpl.h3.nc', 'cpl.h4.nc']
    >>> _hists_match('cpl', hists1, hists2)
    (['FOO.G.cpl.h1.nc'], ['cpl.h4.nc'], [('FOO.G.cpl.h2.nc', 'cpl.h2.nc'), ('FOO.G.cpl.h3.nc', 'cpl.h3.nc')])
    >>> hists1 = ['FOO.G.cpl.h1.nc.SUF1', 'FOO.G.cpl.h2.nc.SUF1', 'FOO.G.cpl.h3.nc.SUF1']
    >>> hists2 = ['cpl.h2.nc.SUF2', 'cpl.h3.nc.SUF2', 'cpl.h4.nc.SUF2']
    >>> _hists_match('cpl', hists1, hists2, 'SUF1', 'SUF2')
    (['FOO.G.cpl.h1.nc.SUF1'], ['cpl.h4.nc.SUF2'], [('FOO.G.cpl.h2.nc.SUF1', 'cpl.h2.nc.SUF2'), ('FOO.G.cpl.h3.nc.SUF1', 'cpl.h3.nc.SUF2')])
    >>> hists1 = ['cam.h0.1850-01-08-00000.nc']
    >>> hists2 = ['cam_0001.h0.1850-01-08-00000.nc','cam_0002.h0.1850-01-08-00000.nc']
    >>> _hists_match('cam', hists1, hists2, '', '')
    ([], [], [('cam.h0.1850-01-08-00000.nc', 'cam_0001.h0.1850-01-08-00000.nc'), ('cam.h0.1850-01-08-00000.nc', 'cam_0002.h0.1850-01-08-00000.nc')])
    >>> hists1 = ['cam_0001.h0.1850-01-08-00000.nc.base','cam_0002.h0.1850-01-08-00000.nc.base']
    >>> hists2 = ['cam_0001.h0.1850-01-08-00000.nc.rest','cam_0002.h0.1850-01-08-00000.nc.rest']
    >>> _hists_match('cam', hists1, hists2, 'base', 'rest')
    ([], [], [('cam_0001.h0.1850-01-08-00000.nc.base', 'cam_0001.h0.1850-01-08-00000.nc.rest'), ('cam_0002.h0.1850-01-08-00000.nc.base', 'cam_0002.h0.1850-01-08-00000.nc.rest')])
    """
    normalized1, normalized2 = [], []
    multi_normalized1, multi_normalized2 = [], []
    multiinst = False

    for hists, suffix, normalized, multi_normalized in [(hists1, suffix1, normalized1, multi_normalized1), (hists2, suffix2, normalized2, multi_normalized2)]:
        for hist in hists:
            normalized_name = hist[hist.rfind(model):]
            if suffix != "":
                expect(normalized_name.endswith(suffix), "How did '{}' not have suffix '{}'".format(hist, suffix))
                normalized_name = normalized_name[:len(normalized_name) - len(suffix) - 1]

            m = re.search("(.+)_[0-9]{4}(.+.nc)",normalized_name)
            if m is not None:
                multiinst = True
                multi_normalized.append(m.group(1)+m.group(2))

            normalized.append(normalized_name)

    set_of_1_not_2 = set(normalized1) - set(normalized2)
    set_of_2_not_1 = set(normalized2) - set(normalized1)

    one_not_two = sorted([hists1[normalized1.index(item)] for item in set_of_1_not_2])
    two_not_one = sorted([hists2[normalized2.index(item)] for item in set_of_2_not_1])

    both = set(normalized1) & set(normalized2)

    match_ups = sorted([ (hists1[normalized1.index(item)], hists2[normalized2.index(item)]) for item in both])

    # Special case - comparing multiinstance to single instance files

    if multi_normalized1 != multi_normalized2:
        # in this case hists1 contains multiinstance hists2 does not
        if set(multi_normalized1) == set(normalized2):
            for idx, norm_hist1 in enumerate(multi_normalized1):
                for idx1, hist2 in enumerate(hists2):
                    norm_hist2 = normalized2[idx1]
                    if norm_hist1 == norm_hist2:
                        match_ups.append((hists1[idx], hist2))
                        if hist2 in two_not_one:
                            two_not_one.remove(hist2)
                        if hists1[idx] in one_not_two:
                            one_not_two.remove(hists1[idx])
        # in this case hists2 contains multiinstance hists1 does not
        if set(multi_normalized2) == set(normalized1):
            for idx, norm_hist2 in enumerate(multi_normalized2):
                for idx1, hist1 in enumerate(hists1):
                    norm_hist1 = normalized1[idx1]
                    if norm_hist2 == norm_hist1:
                        match_ups.append((hist1, hists2[idx]))
                        if hist1 in one_not_two:
                            one_not_two.remove(hist1)
                        if hists2[idx] in two_not_one:
                            two_not_one.remove(hists2[idx])

    if not multiinst:
        expect(len(match_ups) + len(set_of_1_not_2) == len(hists1), "Programming error1")
        expect(len(match_ups) + len(set_of_2_not_1) == len(hists2), "Programming error2")

    return one_not_two, two_not_one, match_ups

def _compare_hists(case, from_dir1, from_dir2, suffix1="", suffix2="", outfile_suffix=""):
    if from_dir1 == from_dir2:
        expect(suffix1 != suffix2, "Comparing files to themselves?")

    testcase = case.get_value("CASE")
    casedir = case.get_value("CASEROOT")
    all_success = True
    num_compared = 0
    comments = "Comparing hists for case '{}' dir1='{}', suffix1='{}',  dir2='{}' suffix2='{}'\n".format(testcase, from_dir1, suffix1, from_dir2, suffix2)
    multiinst_driver_compare = False
    for model in _iter_model_file_substrs(case):
        if model == 'cpl' and suffix2 == 'multiinst':
            multiinst_driver_compare = True
        comments += "  comparing model '{}'\n".format(model)
        hists1 = _get_latest_hist_files(testcase, model, from_dir1, suffix1)
        hists2 = _get_latest_hist_files(testcase, model, from_dir2, suffix2)
        if len(hists1) == 0 and len(hists2) == 0:
            comments += "    no hist files found for model {}\n".format(model)
            continue

        one_not_two, two_not_one, match_ups = _hists_match(model, hists1, hists2, suffix1, suffix2)
        for item in one_not_two:
            comments += "    File '{}' had no counterpart in '{}' with suffix '{}'\n".format(item, from_dir2, suffix2)
            all_success = False
        for item in two_not_one:
            comments += "    File '{}' had no counterpart in '{}' with suffix '{}'\n".format(item, from_dir1, suffix1)
            all_success = False

        num_compared += len(match_ups)

        for hist1, hist2 in match_ups:
            success, cprnc_log_file = cprnc(model, hist1, hist2, case, from_dir1,
                                            multiinst_driver_compare=multiinst_driver_compare,
                                            outfile_suffix=outfile_suffix)
            if success:
                comments += "    {} matched {}\n".format(hist1, hist2)
            else:
                comments += "    {} did NOT match {}\n".format(hist1, hist2)
                comments += "    cat " + cprnc_log_file + "\n"
                expected_log_file = os.path.join(casedir, os.path.basename(cprnc_log_file))
                if not (os.path.exists(expected_log_file) and filecmp.cmp(cprnc_log_file, expected_log_file)):
                    try:
                        safe_copy(cprnc_log_file, casedir)
                    except (OSError, IOError) as _:
                        logger.warning("Could not copy {} to {}".format(cprnc_log_file, casedir))

                all_success = False

    if num_compared == 0:
        all_success = False
        comments += "Did not compare any hist files! Missing baselines?\n"

    comments += "PASS" if all_success else "FAIL"

    return all_success, comments

[docs]def compare_test(case, suffix1, suffix2):
    """
    Compares two sets of component history files in the testcase directory

    case - The case containing the hist files to compare
    suffix1 - The suffix that identifies the first batch of hist files
    suffix1 - The suffix that identifies the second batch of hist files

    returns (SUCCESS, comments)
    """
    rundir   = case.get_value("RUNDIR")

    return _compare_hists(case, rundir, rundir, suffix1, suffix2)

[docs]def cprnc(model, file1, file2, case, rundir, multiinst_driver_compare=False, outfile_suffix=""):
    """
    Run cprnc to compare two individual nc files

    file1 - the full or relative path of the first file
    file2 - the full or relative path of the second file
    case - the case containing the files
    rundir - the rundir for the case
    outfile_suffix - if non-blank, then the output file name ends with this
        suffix (with a '.' added before the given suffix).
        Use None to avoid permissions issues in the case dir.

    returns (True if the files matched, log_name)
    """
    cprnc_exe = case.get_value("CCSM_CPRNC")
    basename = os.path.basename(file1)
    multiinst_regex = re.compile(r'.*%s[^_]*(_[0-9]{4})[.]h.?[.][^.]+?[.]nc' % model)
    mstr = ''
    mstr1 = ''
    mstr2 = ''
    #  If one is a multiinstance file but the other is not add an instance string
    m1 = multiinst_regex.match(file1)
    m2 = multiinst_regex.match(file2)
    if m1 is not None:
        mstr1 = m1.group(1)
    if m2 is not None:
        mstr2 = m2.group(1)
    if mstr1 != mstr2:
        mstr = mstr1+mstr2

    output_filename = os.path.join(rundir, "{}{}.cprnc.out".format(basename, mstr))
    if outfile_suffix:
        output_filename += ".{}".format(outfile_suffix)

    if outfile_suffix is None:
        cpr_stat, out, _ = run_cmd("{} -m {} {}".format(cprnc_exe, file1, file2), combine_output=True)
    else:
        cpr_stat = run_cmd("{} -m {} {}".format(cprnc_exe, file1, file2), combine_output=True, arg_stdout=output_filename)[0]
        with open(output_filename, "r") as fd:
            out = fd.read()

    if multiinst_driver_compare:
        #  In a multiinstance test the cpl hist file will have a different number of
        # dimensions and so cprnc will indicate that the files seem to be DIFFERENT
        # in this case we only want to check that the fields we are able to compare
        # have no differences.
        return (cpr_stat == 0 and " 0 had non-zero differences" in out, output_filename)
    else:
        return (cpr_stat == 0 and "files seem to be IDENTICAL" in out, output_filename)

[docs]def compare_baseline(case, baseline_dir=None, outfile_suffix=""):
    """
    compare the current test output to a baseline result

    case - The case containing the hist files to be compared against baselines
    baseline_dir - Optionally, specify a specific baseline dir, otherwise it will be computed from case config
    outfile_suffix - if non-blank, then the cprnc output file name ends with
        this suffix (with a '.' added before the given suffix). if None, no output file saved.

    returns (SUCCESS, comments)
    SUCCESS means all hist files matched their corresponding baseline
    """
    rundir   = case.get_value("RUNDIR")
    if baseline_dir is None:
        baselineroot = case.get_value("BASELINE_ROOT")
        basecmp_dir = os.path.join(baselineroot, case.get_value("BASECMP_CASE"))
        dirs_to_check = (baselineroot, basecmp_dir)
    else:
        basecmp_dir = baseline_dir
        dirs_to_check = (basecmp_dir,)

    for bdir in dirs_to_check:
        if not os.path.isdir(bdir):
            return False, "ERROR {} baseline directory '{}' does not exist".format(TEST_NO_BASELINES_COMMENT,bdir)

    success, comments = _compare_hists(case, rundir, basecmp_dir, outfile_suffix=outfile_suffix)
    if get_model() == "e3sm":
        bless_log = os.path.join(basecmp_dir, BLESS_LOG_NAME)
        if os.path.exists(bless_log):
            last_line = open(bless_log, "r").readlines()[-1]
            comments += "\n  Most recent bless: {}".format(last_line)

    return success, comments

[docs]def get_extension(model, filepath):
    """
    For a hist file for the given model, return what we call the "extension"

    model - The component model
    filepath - The path of the hist file

    >>> get_extension("cpl", "cpl.hi.nc")
    'hi'
    >>> get_extension("cpl", "cpl.h.nc")
    'h'
    >>> get_extension("cpl", "cpl.h1.nc")
    'h1'
    >>> get_extension("cpl", "TESTRUNDIFF_Mmpi-serial.f19_g16_rx1.A.melvin_gnu.C.fake_testing_only_20160816_164150-20160816_164240.cpl.hi.0.nc.base")
    'hi'
    >>> get_extension("cpl", "TESTRUNDIFF_Mmpi-serial.f19_g16_rx1.A.melvin_gnu.C.fake_testing_only_20160816_164150-20160816_164240.cpl.h.nc")
    'h'
    >>> get_extension("clm","clm2_0002.h0.1850-01-06-00000.nc")
    '0002.h0'
    >>> get_extension("pop","PFS.f09_g16.B1850.cheyenne_intel.allactive-default.GC.c2_0_b1f2_int.pop.h.ecosys.nday1.0001-01-02.nc")
    'h'
    """
    basename = os.path.basename(filepath)
    ext_regex = re.compile(r'.*%s[^_]*_?([0-9]{4})?[.](h.?)([.].*[^.])?[.]nc' % model)

    m = ext_regex.match(basename)
    expect(m is not None, "Failed to get extension for file '{}'".format(filepath))

    if m.group(1) is not None:
        result = m.group(1)+'.'+m.group(2)
    else:
        result = m.group(2)

    return result

[docs]def generate_baseline(case, baseline_dir=None, allow_baseline_overwrite=False):
    """
    copy the current test output to baseline result

    case - The case containing the hist files to be copied into baselines
    baseline_dir - Optionally, specify a specific baseline dir, otherwise it will be computed from case config
    allow_baseline_overwrite must be true to generate baselines to an existing directory.

    returns (SUCCESS, comments)
    """
    rundir   = case.get_value("RUNDIR")
    if baseline_dir is None:
        baselineroot = case.get_value("BASELINE_ROOT")
        basegen_dir = os.path.join(baselineroot, case.get_value("BASEGEN_CASE"))
    else:
        basegen_dir = baseline_dir
    testcase = case.get_value("CASE")

    if not os.path.isdir(basegen_dir):
        os.makedirs(basegen_dir)

    if (os.path.isdir(os.path.join(basegen_dir,testcase)) and
        not allow_baseline_overwrite):
        expect(False, " Cowardly refusing to overwrite existing baseline directory")

    comments = "Generating baselines into '{}'\n".format(basegen_dir)
    num_gen = 0
    for model in _iter_model_file_substrs(case):
        comments += "  generating for model '{}'\n".format(model)
        hists =  _get_latest_hist_files(testcase, model, rundir)
        logger.debug("latest_files: {}".format(hists))
        num_gen += len(hists)
        for hist in hists:
            basename = hist[hist.rfind(model):]
            baseline = os.path.join(basegen_dir, basename)
            if os.path.exists(baseline):
                os.remove(baseline)

            safe_copy(hist, baseline)
            comments += "    generating baseline '{}' from file {}\n".format(baseline, hist)

    # copy latest cpl log to baseline
    # drop the date so that the name is generic
    newestcpllogfile = case.get_latest_cpl_log(coupler_log_path=case.get_value("RUNDIR"))
    if newestcpllogfile is None:
        logger.warning("No cpl.log file found in directory {}".format(case.get_value("RUNDIR")))
    else:
        safe_copy(newestcpllogfile, os.path.join(basegen_dir, "cpl.log.gz"))

    expect(num_gen > 0, "Could not generate any hist files for case '{}', something is seriously wrong".format(os.path.join(rundir, testcase)))
    #make sure permissions are open in baseline directory
    for root, _, files in os.walk(basegen_dir):
        for name in files:
            try:
                os.chmod(os.path.join(root,name), stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH)
            except OSError:
                # We tried. Not worth hard failure here.
                pass

    if get_model() == "e3sm":
        bless_log = os.path.join(basegen_dir, BLESS_LOG_NAME)
        with open(bless_log, "a") as fd:
            fd.write("sha:{} date:{}\n".format(get_current_commit(repo=case.get_value("CIMEROOT")),
                                               get_timestamp(timestamp_format="%Y-%m-%d_%H:%M:%S")))

    return True, comments