Source code for um_utils.cumf

# *****************************COPYRIGHT******************************
# (C) Crown copyright Met Office. All rights reserved.
# For further details please refer to the file LICENCE.txt
# which you should have received as part of this distribution.
# *****************************COPYRIGHT******************************
#
# This file is part of the UM utilities module, which use the Mule API.
#
# Mule and these utilities are free software: you can redistribute it and/or
# modify them under the terms of the Modified BSD License, as published by the
# Open Source Initiative.
#
# These utilities are distributed in the hope that they will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Modified BSD License for more details.
#
# You should have received a copy of the Modified BSD License
# along with these utilities.
# If not, see <http://opensource.org/licenses/BSD-3-Clause>.
"""
CUMF (Compare UM FieldsFiles) is a utility to assist in examining UM files.

Usage:

 * Compare :class:`mule.UMFile` objects with the
   :class:`UMFileComparison` class:

    >>> comp = UMFileComparison(umfile_object1, umfile_object2)

 * This object can be manually examined for details, or you can print either
   a short summary or a full report (note a full report is a super-set of a
   summary report):

   >>> summary_report(comp)
   >>> full_report(comp)

    .. Note::
       The field difference objects behave like the original fields, but their
       data stores the absolute differences.  You could retrieve the data
       using "get_data" to examine it, or write it out to a file.

Global comparison settings:

    The module contains a global "COMPARISON_SETTINGS" dictionary, which
    defines default values for the various options; these may be overidden
    for an entire script/session if desired, or in a startup file e.g.

    >>> from um_utils import cumf
    >>> cumf.COMPARISON_SETTINGS["ignore_missing"] = True

    Alternatively each of these settings may be supplied to the main comparison
    class as keyword arguments.  The available settings are:

    * ignore_templates:
        A dictionary indicating which indices should be ignored when making
        comparisons.  The keys give the names of the components and the values
        are lists of the indices to ignore
        (e.g. {"fixed_length_header": [1,2,3], "lookup": [5,42]})
        (default: ignore creation time in fixed length header only)

    * ignore_missing:
        Flag which sets all positional header indices to be ignored - this is
        useful if the file objects being compared have fields which are
        missing from either file. (default: False)

    * only_report_failures:
        Flag which indicates that the printed output should not contain any
        sections which are simply stating that they agree.  (This cuts down
        on the amount of output for larger files). (default:True)

    * lookup_print_func:
        A callback function which is called for each printed field comparison
        to provide extra information about the fields.  It will be passed 2
        arguments - the comparison field and the stdout object to write to.

    * show_missing:
        Flag which causes a list of fields missing from each file to be
        generated in the report. (default: False)

    * show_missing_max:
        Maximum number of missing fields to display. Set to -1 to indicate no
        maximum. (default: -1)

"""
import re
import sys
import mule
import mule.pp
import errno
import argparse
import textwrap
import numpy as np
import warnings
from six import StringIO
from collections import defaultdict
from um_utils.stashmaster import STASHmaster
from um_utils.pumf import pprint, _banner
from um_utils.version import report_modules


# The following functions are defaults which are used to print some additional
# information about the lookups being compared (to assist in distinguishing
# between similar fields)
def _print_lookup(field, stdout):
    """Prints the validity time, level and processing information."""
    validity_format = "t1({0:04d}/{1:02d}/{2:02d} {3:02d}:{4:02d}:{5:02d})"
    validity = validity_format.format(*field.raw[1:7])

    lev_format = "lblev({0})/blev({1})"
    lev = lev_format.format(field.raw[33], field.raw[52])

    proc_format = "lbproc({0})"
    proc = proc_format.format(field.raw[25])

    stdout.write("  " + "  ".join([validity, lev, proc])+"\n")


# This version is switched to by default for the "full" output mode
def _print_lookup_full(field, stdout):
    """Prints the entire lookup contents using pumf."""
    pprint(field, stdout=stdout, headers_only=True)


# This dictionary stores a list of global settings that control the
# comparison - when called as a main program these can be overidden by
# the command line arguments, or the user can easily adjust these in
# various ways to customise their output.
COMPARISON_SETTINGS = {
    "ignore_templates": {
        "fixed_length_header": [35, 36, 37, 38, 39, 40, 41],
        },
    "ignore_missing": False,
    "only_report_failures": True,
    "lookup_print_func": _print_lookup,
    "show_missing": False,
    "show_missing_max": -1,
    }

# Lookup indices which should be ignored when the user indicates
# they wish to ignore missing fields from either file
_INDEX_IGNORE_MISSING_FIELDS = [
    29,  # lbegin (field start positions will be offset differently)
    40,  # lbuser(2) (for same reasons as above)
    ]
# Entries from the fixed-length-header which should be ignored when
# the user indicates they wish to ignore missing fields
_INDEX_IGNORE_MISSING_FLH = [
    152,  # Number of lookups (different if some fields are missing)
    153,  # Num prog. fields (different if some are missing)
    160,  # Data start (different if number of lookups is different)
    161,  # Data dim1 (different if some fields are missing)
    162,  # Data dim2 (as above)
    ]

# Lookup indices which must be ignored to allow the index to be
# created that matches lookups in the file against each other
_INDEX_IGNORED_LOOKUP = [
    15,  # lblrec (length on disk could be different due to packing)
    20,  # lbext (extra data may be different)
    22,  # lbrel (may have different header release numbers)
    28,  # lbexp (could be from different experiments)
    29,  # lbegin (field start position won't agree if ordered differently)
    30,  # lbnrec (length on disk could be different due to packing)
    38,  # lbsrce (could be different if model version doesn't agree)
    40   # lbuser(2) (for same reason as lbegin)
    ]



[docs]
class DifferenceField(mule.Field):
    """
    Difference object - for two :class:`mule.Field` objects.

    A special subclass of :class:`mule.Field` which looks and behaves just
    like the original class, but defines some extra properties that are useful
    when performing a comparison.

    """
    match = None
    """Global matching flag; True if both the lookup and data match."""

    data_match = None
    """Data matching flag; True if the field data matches."""

    data_shape_match = None
    """Data shape matching flag: True if fields are the same shape."""

    compared = None
    """
    Tuple containing the number of points which are different and the total
    number of points in the field.
    """

    rms_diff = None
    """Root-Mean-Squared difference between the two fields."""

    rms_norm_diff_1 = None
    """
    Root-Mean-Squared difference between the two fields, normalised by the
    values in the first field.

    """

    rms_norm_diff_2 = None
    """
    Root-Mean-Squared difference between the two fields, normalised by the
    values in the second field.

    """

    max_diff = None
    """Maximum difference between the two fields."""

    file_1_index = None
    """The field-index of the first field in its original file."""

    file_2_index = None
    """The field-index of the second field in its original file."""

    lookup_comparison = None
    """
    Holds a :class:`ComponentComparison` object that describes any differences
    in the lookup component of the fields.

    """




[docs]
class DifferenceField2(mule.Field2, DifferenceField):
    """A :class:`DifferenceField` object for :class:`mule.Field2` objects."""
    pass




[docs]
class DifferenceField3(mule.Field3, DifferenceField):
    """A :class:`DifferenceField` object for :class:`mule.Field3` objects."""
    pass



# Maps header release version number onto a difference field class
_DIFFERENCE_FIELDS = {2: DifferenceField2,
                      3: DifferenceField3,
                      -99: DifferenceField,
                      mule._INTEGER_MDI: DifferenceField3}



[docs]
class DifferenceOperator(mule.DataOperator):
    """
    This is a simple operator that calculates the difference between
    the data in two fields.

    """

[docs]
    def __init__(self):
        """Initialise the object."""
        pass



[docs]
    def new_field(self, fields):
        """
        Create a new field instance from the 2 fields being compared.

        This returns a new :class:`DifferenceField` object with the same
        lookup headers as the first field in the list.  It's data will
        contain the absolute difference of the fields (field_1 - field_2).

        Several statistical quantities will also be calculated and saved
        to the new object, for later inspection.

        Args:
            * fields:
                List containing the 2 :class:`mule.Field` objects
                to be compared.

        .. Note::
            Unlike most other operators the data is retrieved in
            this method as well as in the transform method; because
            we need to know if the fields compare.

        """
        # Copy the header from the first field (if they are being
        # compared the headers should already be the same)
        diff_field_class = _DIFFERENCE_FIELDS.get(fields[0].lbrel,
                                                  DifferenceField)
        new_field = diff_field_class(fields[0]._lookup_ints,
                                     fields[0]._lookup_reals,
                                     None)

        # Copy the STASH entry (if it exists)
        new_field.stash = fields[0].stash

        # Get the data from the fields and check if it matches
        # Note: this is an abnormal use of the operator; usually
        # get_data should not be called in this method, however in
        # this case we need to know if the objects are different
        # immediately
        data1 = fields[0].get_data()
        data2 = fields[1].get_data()

        # A quick helper function which calculates the RMS of the arrays
        def rms(array, mdi_val=None):
            if mdi_val is not None:
                rms_points = array[array != mdi_val]
                if rms_points.size == 0:
                    rms_points = array
            else:
                rms_points = array
            return np.sqrt(np.mean(np.square(rms_points)))

        # Store whether the field matches, and several statistical measures
        # of the differences if any are found
        new_field.data_match = np.array_equal(data1, data2, equal_nan=True)

        # If the fields aren't the same shape, it isn't possible to calculate
        # anymore comparison information
        new_field.data_shape_match = data1.shape == data2.shape
        if not new_field.data_shape_match:
            new_field.data_match = False
            return new_field

        if not new_field.data_match:
            bool_field = data1 == data2
            diff = np.abs(data1 - data2)
            # Maximum absolute difference and RMS difference
            new_field.max_diff = np.max(diff)
            new_field.rms_diff = rms(diff)

            # Get the RMS of each field
            rms_field1 = rms(data1, mdi_val=fields[0].bmdi)
            rms_field2 = rms(data2, mdi_val=fields[1].bmdi)

            # Save the normalised RMS difference as a % of each field (if
            # the field was non-zero)
            if rms_field1 > 0.0:
                new_field.rms_norm_diff_1 = (
                    100.0*(new_field.rms_diff / rms_field1))
            if rms_field2 > 0.0:
                new_field.rms_norm_diff_2 = (
                    100.0*(new_field.rms_diff / rms_field2))

            # Save the number of points compared and the total number of points
            new_field.compared = (bool_field.size - np.sum(bool_field),
                                  bool_field.size)
        else:
            # If nothing was compared ensure everything is set appropriately
            new_field.max_diff = 0.0
            new_field.rms_diff = 0.0
            new_field.rms_norm_diff_1 = 0.0
            new_field.rms_norm_diff_2 = 0.0
            new_field.compared = (0, data1.size)

        # Add 1 to lbproc - to indicate it is a different between fields
        # (note the default "Field" objects do not know this property)
        if new_field.lbrel in (2, 3):
            new_field.lbproc += 1

        # Turn off WGDOS packing if used - we can't guarantee that the
        # differences will be able to be packed to the original accuracy
        if new_field.lbpack == 1:
            new_field.lbpack = 0
            new_field.bacc = -99.0

        return new_field



[docs]
    def transform(self, fields, new_field):
        """Return the absolute differences between the two fields."""
        data1 = fields[0].get_data()
        data2 = fields[1].get_data()
        return data1 - data2





[docs]
class ComponentComparison(object):
    """
    This class stores an individual comparison result; valid for any
    pair of UM header components.

    """
    match = None
    """Global matching flag; True if both the lookup and data match."""

    compared = None
    """
    Tuple pair indicating how many values were compared and the total
    number of possible comparisons.

    """

    in_file_1 = None
    """Presence flag; True if the first component exists."""

    in_file_2 = None
    """Presence flag; True if the second component exists."""

    same_shape = None
    """Shape flag; True if the components are the same shape."""

    ignored = None
    """Stores a list of any indices which were ignored."""

    diffs = None
    """
    If the components differ, this list stores the differences; it will
    contain one tuple for each difference, consisting of:

        * The index into the components where the difference occurs.
        * The value of the item in component_1.
        * The value of the item in component_2.

    .. Note:
        The length of this list should not be relied on to detect if
        the components match.  If the other flags dictate that the two
        components are not the same shape or either of them are missing,
        the comparison will never be done and diffs will still return
        an empty list.

    """
    component_1 = None
    """A reference to the first component."""

    component_2 = None
    """A reference to the second component"""


[docs]
    def __init__(self, component_1, component_2, ignore_indices=[]):
        """
        Return elements of the components which do not agree.

        Args:
            * component_1:
                The first component to compare.
            * component_2:
                The second component to compare.

        Kwargs:
            * ignore_indices:
                If provided, a list of indices to ignore when performing
                the check.

        """

        # Initialise the matching and difference list.
        self.match = True
        self.diffs = []
        self.compared = (0, 0)
        self.component_1 = component_1
        self.component_2 = component_2

        # Save a copy of which (if any) indices were ignored (for reporting)
        self.ignored = ignore_indices

        # Check if the components are both present.
        self.in_file_1 = component_1 is not None
        self.in_file_2 = component_2 is not None
        if not (self.in_file_1 and self.in_file_2):
            self.match = self.in_file_1 == self.in_file_2
            return

        # Get the component shapes.
        shape_1 = component_1.raw.shape
        shape_2 = component_2.raw.shape

        # Check if the two are the same shape; save this information then
        # abort the comparison if they aren't compatible
        self.same_shape = shape_1 == shape_2
        if not self.same_shape:
            self.match = False
            return

        # Zip the raw values together - call ravel on them first so that
        # any 2-d component becomes a 1-d equivalent (it won't hurt any
        # actual 1-d arrays).
        component_zip = zip(component_1.raw.ravel(),
                            component_2.raw.ravel())

        # Go through the values; if the elements aren't equal add
        # the index and both element values to the list.
        comparison_count = 0
        for i_element, elements in enumerate(component_zip):
            # Note: call unravel here to work out the original
            # index in the 2-d case.
            index = np.unravel_index(i_element, shape_1)
            if len(index) == 1:
                index = index[0]

            # Only perform the check if this index wasn't
            # explicitly filtered out by the user
            if index not in ignore_indices:
                comparison_count += 1
                if elements[0] != elements[1]:
                        # The list then contains the index and the elements
                        # that are different
                        self.diffs.append((index, elements))

        # Save the number of comparisons performed and the total size (note
        # the "null" quantities padding the components have to be subtracted
        # to give a true value)
        if len(shape_1) == 2:
            null_values = shape_1[0]
        else:
            null_values = 1

        self.compared = (comparison_count - null_values,
                         len(component_1.raw.ravel()) - null_values)

        # Toggle the match flag if any differences were found
        if len(self.diffs) != 0:
            self.match = False





[docs]
class UMFileComparison(object):
    """
    A structure which stores comparison information between two
    :class:`mule.UMFile` subclasses.

    """

    match = None
    """Global matching flag; True if everything about the files matches."""

    file_1 = None
    """A reference to the first file object."""

    file_2 = None
    """A reference to the second file object."""

    files_are_same_type = None
    """Type flag; True if both files are the same file type."""

    comparisons = None
    """
    A dictionary containing a :class:`ComponentComparison` object for
    each of the possible UM file header components (except the lookup).
    The dictionary keys are the component names (e.g. "fixed_length_header")

    """

    field_comparisons = None
    """
    A list of :class:`DifferenceField` objects; one for each pair of fields
    compared between the two files.

    """
    lookup_ignores = None
    """
    A list of the lookup indices which were ignored for this comparison

    """

    show_missing = False
    """
    Flag which details if a list of missing fields for each file should be
    generated in reports.

    """

    show_missing_max = -1
    """
    The maximum number of missing fields to list. Set to -1 to indicate no
    maximum.

    """

    max_rms_diff_1 = None
    """
    A tuple containing the maximum encountered RMS difference relative to
    the data in the first file, and the index of the field containing it.

    """

    max_rms_diff_2 = None
    """
    A tuple containing the maximum encountered RMS difference relative to
    the data in the second file, and the index of the field containing it.

    """

    unmatched_file_1 = []
    """
    A list containing the indices of any fields which exist in file 1 but
    were not successfully matched to a field in file 2.

    """

    unmatched_file_2 = []
    """
    A list containing the indices of any fields which exist in file 2 but
    were not successfully matched to a field in file 1.

    """


[docs]
    def __init__(self, um_file1, um_file2, **kwargs):
        """
        Create the comparison object.

        Args:
            * um_file1:
                The first :class:`mule.UMFile` subclass.
            * um_file2:
                The second :class:`mule.UMFile` subclass.

        Kwargs:
            Any other keywords are assumed to be settings to override
            the values in the global COMPARISON_SETTINGS dictionary,
            see the docstring of the :mod:`cumf` module for details

        """

        # Deal with the possible keywords - take the global print settings
        # dictionary as a starting point and add any changes supplied in
        # the call to this method
        comp_settings = COMPARISON_SETTINGS.copy()
        for keyword, value in kwargs.items():
            if keyword in comp_settings:
                comp_settings[keyword] = value
            else:
                msg = "Keyword not recognised: {0}"
                raise ValueError(msg.format(keyword))

        # Global flag to indicate if the files match
        self.match = True

        # Store a reference to the two original file objects
        self.file_1 = um_file1
        self.file_2 = um_file2

        # Check if the files are of the same type
        self.files_are_same_type = type(um_file1) is type(um_file2)
        self.match = self.match and self.files_are_same_type

        # Remove the empty lookups (if the files are fieldsfiles)
        if type(um_file1) is mule.FieldsFile:
            um_file1.remove_empty_lookups()
        if type(um_file2) is mule.FieldsFile:
            um_file2.remove_empty_lookups()

        # First we want to create a list of comparisons of the header
        # compoennts in the file
        self.comparisons = {}

        # Create a list of expected component names; take these from the
        # UMFile base class since it is setup to include all possible/known
        # component names used in the different classes
        component_list = (["fixed_length_header"] +
                          [name for name, _ in mule.UMFile.COMPONENTS])

        # Compare each component, accounting for the possibility that one
        # file might contain it and the other might not
        for name in component_list:
            component_1 = getattr(um_file1, name, None)
            component_2 = getattr(um_file2, name, None)

            # If the template for ignores sets up any indices to ignore
            # for this component extract them here - use the list cast to
            # avoid changing the settings dictionary in-place
            component_ignores = list(
                comp_settings["ignore_templates"].get(name, []))

            if (comp_settings["ignore_missing"] and
                    name == "fixed_length_header"):
                component_ignores.extend(_INDEX_IGNORE_MISSING_FLH)

            comparison = ComponentComparison(component_1, component_2,
                                             component_ignores)
            self.comparisons[name] = comparison

            # Update the global matching if any component fails to match
            self.match = self.match and comparison.match

        # For the fields we will need the difference operator defined above,
        # but it needs to be initialised first
        difference_op = DifferenceOperator()

        # Get the (user) list of lookup elements to ignore
        lookup_ignores = (
            comp_settings["ignore_templates"].get("lookup", []))

        # If the user has chosen to ignore missing fields, add the required
        # elements of the lookup to the ignore list
        if comp_settings["ignore_missing"]:
            lookup_ignores.extend(_INDEX_IGNORE_MISSING_FIELDS)

        lookup_ignores = sorted(list(set(lookup_ignores)))

        # Save the list of ignored lookup indices to this object for later
        self.lookup_ignores = lookup_ignores

        # Save the show-missing option
        self.show_missing = comp_settings["show_missing"]
        self.show_missing_max = comp_settings["show_missing_max"]

        # Initialise the elements which hold the field comparison objects
        self.field_comparisons = []
        self.max_rms_diff_1 = [0, 0]
        self.max_rms_diff_2 = [0, 0]

        # If there aren't any fields in the first file, there isn't anything
        # to compare
        if len(um_file1.fields) == 0:
            # And unless this is allowed or expected, it's also a failure
            if (len(um_file2.fields) != 0 and
                    not comp_settings["ignore_missing"]):
                self.match = False
            return

        # Create a mapping which relates the lookups in the two files (in
        # case the ordering of fields has changed)
        index = self._create_index(um_file1, um_file2, lookup_ignores)

        # If the matchings don't account for all fields, the files cannot
        # completely match (unless the user has specified that this is okay)
        n_indices = len(index)
        if ((n_indices != len(um_file1.fields) or
                n_indices != len(um_file2.fields)) and
                not comp_settings["ignore_missing"]):
            self.match = False

        # Now iterate through the fields whose lookups appear to match
        for ifield_1, ifield_2 in index:

            field_1 = um_file1.fields[ifield_1]
            field_2 = um_file2.fields[ifield_2]

            # Compare the lookups themselves with a comparison object
            lookup_comparison = ComponentComparison(field_1, field_2,
                                                    lookup_ignores)

            # Create a field difference object, which stores information about
            # the differences and the means to obtain a difference map.
            # Note: technically this operator is reading both fields at this
            # point, since it must do this to determine if the fields are
            # different - this is intentional and differs from how operators
            # are commonly used)
            diff_field = difference_op([field_1, field_2])

            diff_field.file_1_index = ifield_1
            diff_field.file_2_index = ifield_2
            diff_field.lookup_comparison = lookup_comparison

            # Keep a running total of the largest RMS differences
            if diff_field.rms_norm_diff_1 is not None:
                if self.max_rms_diff_1[0] < diff_field.rms_norm_diff_1:
                    self.max_rms_diff_1 = (diff_field.rms_norm_diff_1,
                                           ifield_1 + 1)

            if diff_field.rms_norm_diff_2 is not None:
                if self.max_rms_diff_2[0] < diff_field.rms_norm_diff_2:
                    self.max_rms_diff_2 = (diff_field.rms_norm_diff_2,
                                           ifield_2 + 1)

            # The field object only matches if both the lookups and the
            # data match
            diff_field.match = (lookup_comparison.match and
                                diff_field.data_match)

            # Update the global matching if any field or lookup fails to match
            self.match = self.match and diff_field.match

            # Append the information and objects to the comparison list
            self.field_comparisons.append(diff_field)


    def _create_index(self, um_file1, um_file2, lookup_ignores=[]):
        """
        Method to attempt to match fields in the two files by their lookups.

        """
        # Create a base set of lookups to ignore when trying to match fields,
        # these entries can change very readily even in files which do
        # technically compare, so should always be ignored
        set_ignored_lookups = set(_INDEX_IGNORED_LOOKUP)

        # The user may additionally have provided their own set of indices
        # to ignore - combine them with the default ones here to make a
        # complete set
        set_ignored_lookups = set_ignored_lookups | set(lookup_ignores)

        # Generate a set of indices the length of the first lookup header
        # in the file and take its compliment with the list above to end up
        # with only the indices we wish to compare as a list
        set_lookups_to_check = set(range(1, len(um_file1.fields[0].raw)))
        set_lookups_to_check = set_lookups_to_check - set_ignored_lookups
        lookups_to_check = sorted(list(set_lookups_to_check))

        # Create a list of the indices of the fields in file 1, the indices
        # will be removed from this list as the fields are processed
        set_unmatched_in_file1 = set(range(len(um_file1.fields)))
        index = []

        # Create a dictionary storing sets of the indices in file 2 separated
        # according to their stash code, with that stash code as the keys
        file_2_fields_by_stash = defaultdict(list)
        for ifield2, field in enumerate(um_file2.fields):
            file_2_fields_by_stash[field.lbuser4].append(ifield2)

        # Can now go through the fields in file 1 and identify matches
        for ifield1, field1 in enumerate(um_file1.fields):
            lookup1 = field1.raw[lookups_to_check]
            # Look for matching lookup in file_2.fields
            stash_item = field1.lbuser4
            if stash_item in file_2_fields_by_stash:
                for ifield2 in file_2_fields_by_stash[stash_item]:
                    # When comparing each lookup, check only the indices that
                    # were specified above (some indices will rarely match)
                    lookup2 = um_file2.fields[ifield2].raw[lookups_to_check]
                    if all(lookup1 == lookup2):
                        # Save the indices of the matched fields, and remove
                        # them from both sets so that they can't be matched
                        # multiple times and for some minor performance savings
                        index.append((ifield1, ifield2))
                        set_unmatched_in_file1.remove(ifield1)
                        file_2_fields_by_stash[stash_item].remove(ifield2)
                        break  # Move to next field in file 1

        # Any indices left in either list represent fields for which a
        # match was not found between the files.  Save these indices
        # so that they can be referred to in any reporting
        self.unmatched_file_1 = sorted(
            list(set_unmatched_in_file1))

        # The file 2 dictionary needs to be unravelled from the stash code
        # dictionary and back into a flat list
        self.unmatched_file_2 = []
        for stash_item in file_2_fields_by_stash:
            self.unmatched_file_2.extend(file_2_fields_by_stash[stash_item])
        self.unmatched_file_2 = sorted(self.unmatched_file_2)

        return index




[docs]
def summary_report(comparison, stdout=None):
    """
    Print a report giving a brief summary of a comparison object.

    Args:
        * comparison:
            A :class:`UMFileComparison` object, populated with the
            differences between two files.

    Kwargs:
        * stdout:
            A open file-like object to write the report to.

    """
    # Setup output
    if stdout is None:
        stdout = sys.stdout

    stdout.write(_banner("CUMF-II Comparison Report")+"\n")

    # Report the names of the files
    stdout.write("File 1: {0}\n".format(comparison.file_1._source_path))
    stdout.write("File 2: {0}\n".format(comparison.file_2._source_path))

    # Assume files differ unless proven otherwise
    files_differ = True

    # First of all do the files compare overall
    if comparison.match:
        stdout.write("Files compare\n")
        files_differ = False
    else:
        stdout.write("Files DO NOT compare\n")
        files_differ = True

    # Warn if the files are not the same type
    if not comparison.files_are_same_type:
        stdout.write("WARNING: Files are not the same type!  This is likely "
                     "to cause unknown differences\n")

    # Create the component list from the base file class
    component_list = (["fixed_length_header"] +
                      [name for name, _ in mule.UMFile.COMPONENTS])

    # First pass loop to present a quick overview of what exactly is wrong
    for name in component_list:
        comp_comp = comparison.comparisons[name]
        if not comp_comp.match:
            stdout.write(
                "  * {0} differences in {1} (with {2} ignored indices)\n"
                .format(len(comp_comp.diffs), name, len(comp_comp.ignored)))
        elif len(comp_comp.ignored) > 0:
            stdout.write(
                "  * 0 differences in {0} (with {1} ignored indices)\n"
                .format(name, len(comp_comp.ignored)))

    if len(comparison.field_comparisons) > 0:
        field_matches = np.array(
            [(comp_field.match, comp_field.data_match)
                for comp_field in comparison.field_comparisons])
        n_diff, n_data_diff = np.sum(np.bitwise_not(field_matches), axis=0)
        stdout.write("  * {0} field differences, of which {1} are in data\n"
                     .format(n_diff, n_data_diff))
    stdout.write("\n")

    # Summarise the field differences
    fields_compared = len(comparison.field_comparisons)
    if comparison.unmatched_file_1 is None:
        total_fields = 0
    else:
        total_fields = (fields_compared +
                        len(comparison.unmatched_file_1) +
                        len(comparison.unmatched_file_2))
    matches = sum([fcomp.match for fcomp in comparison.field_comparisons])
    stdout.write("Compared {0}/{1} fields, with {2} matches\n"
                 .format(fields_compared, total_fields, matches))

    # If not all the fields were matched, report on the distribution of the
    # mis-match
    if len(comparison.unmatched_file_1) > 0:
        msg = "{0} fields found in file 1 were not in file 2\n"
        stdout.write(msg.format(len(comparison.unmatched_file_1)))
    if len(comparison.unmatched_file_2) > 0:
        msg = "{0} fields found in file 2 were not in file 1\n"
        stdout.write(msg.format(len(comparison.unmatched_file_2)))

    # If not all fields were compared, report here, and exit if none were
    # compared, unless --show-missing was requested. In that case continue
    # far enough to print lookup ignores, which are now relevant.
    if fields_compared != total_fields and fields_compared == 0:
        if not comparison.show_missing:
            stdout.write("\n")
            return files_differ

    # Report missing fields if requested
    if comparison.show_missing:
        if fields_compared == 0:
            stdout.write("Not listing specific missing fields,"
                         " because all fields are missing from both files."
                         " (No fields are common.)\n")
        else:
            msg = " * {0}/{1}: {2} -{3}\n"
            counts = [comparison.unmatched_file_1, comparison.unmatched_file_2]
            umfiles = [comparison.file_1, comparison.file_2]
            for ifile, (count, umfile) in enumerate(zip(counts, umfiles)):
                total_missing_shown = 0
                file_a = str(ifile % 2 + 1)
                file_b = str((ifile + 1) % 2 + 1)
                if len(count) > 0:
                    stdout.write("\n")
                    stdout.write("Fields in file {0} but not file {1}:\n"
                                 .format(file_a, file_b))
                    for index in count:
                        if (total_missing_shown >=
                                comparison.show_missing_max and
                                comparison.show_missing_max != -1):
                            stdout.write(
                                "  More fields are missing from file {0:s},"
                                .format(file_b) +
                                " but the print maximum has been reached.\n")
                            break
                        if umfile.fields[index].stash is not None:
                            if umfile.fields[index].stash.name is not None:
                                stashname = umfile.fields[index].stash.name
                            else:
                                stashname = "Unknown STASH (code: {})".format(
                                    umfile.fields[index].lbuser4)
                        else:
                            stashname = "Unknown STASH (code: {})".format(
                                umfile.fields[index].lbuser4)
                        lookup_info = StringIO()
                        _print_lookup(umfile.fields[index], lookup_info)
                        stdout.write(msg.format(index+1,
                                                len(umfile.fields),
                                                stashname,
                                                lookup_info.getvalue()))
                        lookup_info.close()
                        total_missing_shown = total_missing_shown + 1

    # Report which indices were ignored from the lookups
    if len(comparison.lookup_ignores) > 0:
        ignored = []
        stdout.write("\n")
        for index in comparison.lookup_ignores:
            indexstr = str(index)
            for map_name, map_ind in mule._LOOKUP_HEADER_3:
                if map_ind == index:
                    indexstr = "{0} ({1})".format(index, map_name)
                    break
            ignored.append(indexstr)
        stdout.write("Ignored lookup indices:\n  Index {0}\n"
                     .format("\n  Index ".join(ignored)))

    stdout.write("\n")

    # If not all fields were compared, report here, and exit if none were
    # compared
    if fields_compared != total_fields and fields_compared == 0:
        return files_differ

    # Report on the maximum RMS diff percentage
    if comparison.max_rms_diff_1[0] > 0.0:
        stdout.write(
            "Maximum RMS diff as % of data in file 1: "
            "{0:<18.17g} (field {1})\n"
            .format(*comparison.max_rms_diff_1))
    if comparison.max_rms_diff_2[0] > 0.0:
        stdout.write(
            "Maximum RMS diff as % of data in file 2: "
            "{0:<18.17g} (field {1})\n"
            .format(*comparison.max_rms_diff_2))

    if (comparison.max_rms_diff_1[0] > 0.0 or
            comparison.max_rms_diff_2[0] > 0.0):
        stdout.write("\n")

    return files_differ




[docs]
def full_report(comparison, stdout=None, **kwargs):
    """
    Print a report giving a full analysis of a comparison object.

    Args:
        * comparison:
            A :class:`UMFileComparison` object, populated with the
            differences between two files.

    Kwargs:
        * stdout:
            A open file-like object to write the report to.

    Other Kwargs:
        Any other keywords are assumed to be settings to override
        the values in the global COMPARISON_SETTINGS dictionary,
        see the docstring of the :mod:`cumf` module for details

    """
    # Setup output
    if stdout is None:
        stdout = sys.stdout

    # Deal with the possible keywords - take the global print settings
    # dictionary as a starting point and add any changes supplied in
    # the call to this method
    comp_settings = COMPARISON_SETTINGS.copy()
    for keyword, value in kwargs.items():
        if keyword in comp_settings:
            comp_settings[keyword] = value
        else:
            msg = "Keyword not recognised: {0}"
            raise ValueError(msg.format(keyword))

    # The full report contains the summary at the beginning
    files_differ = summary_report(comparison, stdout)

    # Create the component list from the base file class
    component_list = (["fixed_length_header"] +
                      [name for name, _ in mule.UMFile.COMPONENTS])

    # Get the verbosity setting from the dictionary
    only_report_failures = comp_settings["only_report_failures"]

    # Define a quick function for convenience since it's used in a two
    # places below - this is used to format the index report nicely
    def report_index_errors(diffmap, stdout, name_mapping):
        to_output = []
        max_width = [0, 0, 0]
        # Capture the widest width needed for each element
        for index, (value_1, value_2) in diffmap:
            # Set the index string to be the numerical value
            # As of Numpy 2.0 str((np.int, np.int)) has different formatting, so
            # actively create the same formatting here to satisfy unit tests
            try:
                indexstr = f"({', '.join(str(x) for x in index)})"
            except TypeError:
                indexstr = str(index)
            if name_mapping is not None:
                # If a mapping was given, add the associated name here
                for map_name, map_ind in name_mapping:
                    if map_ind == index:
                        indexstr = "{0} ({1})".format(index, map_name)
                        break

            valstr_1 = str(value_1)
            valstr_2 = str(value_2)
            max_width[0] = max(max_width[0], len(indexstr))
            max_width[1] = max(max_width[1], len(valstr_1))
            max_width[2] = max(max_width[2], len(valstr_2))
            to_output.append((indexstr, valstr_1, valstr_2))

        # Construct an appropriate width statement
        width_format = ("  Index {0:"+str(max_width[0])+"} differs - "
                        "file_1: {1: >"+str(max_width[1])+"}  "
                        "file_2: {2: >"+str(max_width[2])+"}\n")

        # Output the nicely formatter lines
        for output in to_output:
            stdout.write(width_format.format(*output))

    # Now report on differences bettween the components
    for name in component_list:
        comp_comp = comparison.comparisons[name]

        # Prepare a message showing the number of values compared
        msg_values = "(compared {0}/{1} values)".format(*comp_comp.compared)

        # Also report the indices that were ignored
        if len(comp_comp.ignored) > 0:
            ignored = []
            mapping = comp_comp.component_1.HEADER_MAPPING
            for index in comp_comp.ignored:
                indexstr = str(index)
                if mapping is not None:
                    for map_name, map_ind in mapping:
                        if map_ind == index:
                            indexstr = "{0} ({1})".format(index, map_name)
                            break
                ignored.append(indexstr)
            msg_ignore = ("\nIgnored indices:\n  Index {0}"
                          .format("\n  Index ".join(ignored)))
            msg_values += msg_ignore

        if comp_comp.match:
            # If they agree simply state this and move on
            if not only_report_failures:
                stdout.write(_banner(name))
                stdout.write("Components compare {0}\n\n".format(msg_values))
        else:
            # If they disagree move onto the reason/s why
            stdout.write(_banner(name))
            stdout.write("Components DO NOT compare {0}\n".format(msg_values))

            # Check to see if they component was missing in either file
            if not comp_comp.in_file_1:
                stdout.write("Component missing from file 1\n")
            if not comp_comp.in_file_2:
                stdout.write("Component missing from file 2\n")
            # ... and if they were the same shape
            if not comp_comp.same_shape:
                stdout.write("Component shape do not match\n")

            # Get the possible names for the indices (1d headers only)
            if len(comp_comp.component_1.shape) == 1:
                index_name_ref = comp_comp.component_1.HEADER_MAPPING
            else:
                index_name_ref = None

            stdout.write("Component differences:\n")
            report_index_errors(comp_comp.diffs, stdout, index_name_ref)
            stdout.write("\n")

    # Get the total number of fields
    fields_compared = len(comparison.field_comparisons)

    # Get the printing callback function from the settings dictionary
    print_lookups = comp_settings["lookup_print_func"]

    # If this is the default pumf case, and the callback hasn't been
    # overidden by the user, switch it for the more verbose version
    if not only_report_failures and print_lookups is _print_lookup:
        print_lookups = _print_lookup_full

    # Each field is treated individually for both its lookup and data parts
    for ifield, comp_field in enumerate(comparison.field_comparisons):

        comp_lookup = comp_field.lookup_comparison

        # First a simple message explaining if the field broadly compares
        heading = "Field {0}/{1} ".format(ifield + 1, fields_compared)
        if comp_field.stash is not None:
            heading += "- " + comp_field.stash.name

        if comp_field.match:
            # If the field compares report this and continue
            if only_report_failures:
                continue
            stdout.write(_banner(heading))
            stdout.write("Field compares\n")
        else:
            stdout.write(_banner(heading))
            # Report the status of the two components separately
            if comp_lookup.match:
                stdout.write("Lookup compares, ")
            else:
                stdout.write("Lookup DOES NOT compare, ")
            if comp_field.data_match:
                stdout.write("data compares\n")
            else:
                stdout.write("data DOES NOT compare\n")

        # Indicate how many lookup values were actually compared
        stdout.write("Compared {0}/{1} lookup values.\n"
                     .format(*comp_lookup.compared))

        # Print some extra information about the fields
        stdout.write("File_1 lookup info:\n")
        print_lookups(comp_field.lookup_comparison.component_1, stdout)

        # Report if there was a difference in the ordering of the fields
        if comp_field.file_1_index != comp_field.file_2_index:
            msg = ("Order difference: field is #{0} in file 1 "
                   "but #{1} in file 2\n")
            stdout.write(msg.format(comp_field.file_1_index+1,
                                    comp_field.file_2_index+1))

        if not comp_lookup.match:
            # If there were any lookup differences report them
            stdout.write("Lookup differences:\n")
            report_index_errors(comp_lookup.diffs, stdout,
                                comp_field.HEADER_MAPPING)

        if not comp_field.data_shape_match:
            # If the data shape wasn't the same there isn't much to report here
            stdout.write("Data shapes are different, no comparison possible\n")
        elif not comp_field.data_match:
            # If there were any data differences report them
            stdout.write("Data differences:\n")
            stdout.write("  Number of point differences  : {0}/{1}\n"
                         .format(*comp_field.compared))
            stdout.write("  Maximum absolute difference  : {0:<18.17g}\n"
                         .format(comp_field.max_diff))
            stdout.write("  RMS difference               : {0:<18.17g}\n"
                         .format(comp_field.rms_diff))
            if comp_field.rms_norm_diff_1 is None:
                stdout.write("  RMS diff as % of file_1 data : "
                             "NaN (File 1 data all zero) \n")
            else:
                stdout.write("  RMS diff as % of file_1 data : {0:<18.17g}\n"
                             .format(comp_field.rms_norm_diff_1))
            if comp_field.rms_norm_diff_2 is None:
                stdout.write("  RMS diff as % of file_2 data : "
                             "NaN (File 2 data all zero) \n")
            else:
                stdout.write("  RMS diff as % of file_2 data : {0:<18.17g}\n"
                             .format(comp_field.rms_norm_diff_2))

        stdout.write("\n")

    return files_differ



def _main():
    """
    Main function; accepts command line arguments to override the comparison
    settings and provides a pair of UM files to compare.

    """
    # Setup help text
    help_prolog = """    usage:
      %(prog)s [-h] [options] file_1 file_2

    This script will compare all headers and data from two UM files,
    and write a report describing any differences to stdout.  The
    assumptions made by the comparison may be customised with a
    variety of options (see below).
    """
    title = _banner(
        "CUMF-II - Comparison tool for UM Files, version II "
        "(using the Mule API)", banner_char="=")

    # Include a list of the component names as they appear in Mule
    component_names = ", ".join(
        (["fixed_length_header"] +
         [name for name, _ in mule.UMFile.COMPONENTS] +
         ["lookup"]))

    help_epilog = """
    possible component names for the ignore option:
    {0}

    for details of the indices see UMDP F03:
      https://code.metoffice.gov.uk/doc/um/latest/papers/umdp_F03.pdf
    """.format(textwrap.fill(component_names,
                             width=80,
                             initial_indent=4*" ",
                             subsequent_indent=8*" "))

    class ShowMissingAction(argparse.Action):
        def __init__(self, option_strings, dest, nargs=None, **kwargs):
            super(ShowMissingAction, self).__init__(
                option_strings, dest, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            setattr(namespace, "show_missing", [True, values])

    # Setup the parser
    parser = argparse.ArgumentParser(
        usage=argparse.SUPPRESS,
        description=title + textwrap.dedent(help_prolog),
        epilog=textwrap.dedent(help_epilog),
        formatter_class=argparse.RawTextHelpFormatter,
        )

    # No need to output help text for the two input files (these are obvious)
    parser.add_argument("file_1", help=argparse.SUPPRESS)
    parser.add_argument("file_2", help=argparse.SUPPRESS)

    parser.add_argument(
        '--ignore',
        help="ignore specific indices of a component; provide the name of \n"
        "the component and a comma separated list of indices or ranges \n"
        "(i.e. M:N) to ignore.  This may be specified multiple times to \n"
        "ignore indices from more than one component. \n"
        "If this option is not used, by default mule-cumf will ignore \n"
        "creation time in fixed length header only\n ",
        metavar="component_name=index1[,index2][...]",
        action="append")
    parser.add_argument(
        '--ignore-missing',
        action='store_true',
        help="if present, positional headers will be ignored (required if \n"
        "missing fields from either file should not be considered a failure \n"
        "to compare)\n ")
    parser.add_argument(
        '--diff-file',
        help="a filename to write a new UM file to which contains the \n"
        "absolute differences for any fields that differ\n ",
        metavar="filename")
    parser.add_argument(
        '--full', action='store_true',
        help="if not using summary output, will increase the verbosity by \n"
        "reporting on all comparisons (default behaviour is to only report \n"
        "on failures)\n ")
    parser.add_argument(
        '--summary', action='store_true',
        help="print a much shorter report which summarises the differences \n"
        "between the files without going into much detail\n ")
    parser.add_argument(
        "--stashmaster",
        help="either the full path to a valid stashmaster file, or a UM \n"
        "version number e.g. '10.2'; if given a number cumf will look in \n"
        "the path defined by: \n"
        "  mule.stashmaster.STASHMASTER_PATH_PATTERN \n"
        "which by default is : \n"
        "  $UMDIR/vnX.X/ctldata/STASHmaster/STASHmaster_A\n")
    parser.add_argument(
        "--show-missing",
        nargs='1',
        action=ShowMissingAction,
        default=[False, -1],
        metavar='[=N]',
        help="display missing fields from either file. If given, N is the\n"
        " maximum number of fields to display.\n")
    parser.add_argument(
        "--fail-if-differ",
        help="if set, then exit with a return code of 1 if two files differ.\n",
        action='store_true',
        default=False
    )

    # If the user supplied no arguments, print the help text and exit
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit(1)

    # set the default value for --show-missing if none was given
    try:
        sys.argv[sys.argv.index("--show-missing")] = "--show-missing=-1"
    except ValueError:
        pass

    args = parser.parse_args()

    # Print version information
    print(_banner("(CUMF-II) Module Information")),
    report_modules()
    print("")

    # Process ignoring indices from
    if args.ignore is not None:
        for ignore_list in args.ignore:
            if "=" in ignore_list:
                name, indices = ignore_list.split("=")
                ignores = []
                for arg in indices.split(","):
                    if re.match(r"^\d+$", arg):
                        ignores.append(int(arg))
                    elif re.match(r"^\d+:\d+$", arg):
                        ignores += range(*map(int, arg.split(":")))
                    else:
                        msg = "Unrecognised index in ignore list: {0}"
                        raise ValueError(msg.format(ignore_list))
                COMPARISON_SETTINGS["ignore_templates"][name] = ignores

    # Process the ignore missing flag
    COMPARISON_SETTINGS["ignore_missing"] = args.ignore_missing
    COMPARISON_SETTINGS["show_missing"] = args.show_missing[0]
    if args.show_missing[0]:
        COMPARISON_SETTINGS["show_missing_max"] = args.show_missing[1]

    # If provided, load the given stashmaster
    stashm = None
    if args.stashmaster is not None:
        if re.match(r"\d+.\d+", args.stashmaster):
            stashm = STASHmaster.from_version(args.stashmaster)
        else:
            stashm = STASHmaster.from_file(args.stashmaster)
        if stashm is None:
            msg = "Cannot load user supplied STASHmaster"
            raise ValueError(msg)

    if args.full:
        COMPARISON_SETTINGS["only_report_failures"] = False
        COMPARISON_SETTINGS["show_missing"] = True

    # Check if either of these are pp files
    um_files = []
    pp_mode = False
    for input_file in (args.file_1, args.file_2):
        if mule.pp.file_is_pp_file(input_file):
            # Make an empty fieldsfile object and attach the pp file's
            # field objects to it
            pp_mode = True
            um_file = mule.FieldsFile()
            um_file.fields = mule.pp.fields_from_pp_file(input_file)
            um_file._source_path = input_file
            if stashm is not None:
                um_file.attach_stashmaster_info(stashm)
        else:
            um_file = mule.load_umfile(input_file, stashmaster=stashm)
        um_files.append(um_file)

    comparison = UMFileComparison(um_files[0], um_files[1])

    # Now print a report to stdout, if a SIGPIPE is received handle
    # it appropriately
    try:
        if args.summary:
            files_differ = summary_report(comparison)
        else:
            files_differ = full_report(comparison)
    except IOError as error:
        if error.errno != errno.EPIPE:
            raise

    # If requested, and any data differences exist, write to diff file
    if args.diff_file is not None:
        # Cannot do this if a pp file was involved
        if pp_mode:
            msg = ("At least one of the files was a pp file, cannot "
                   "produce a difference file in this case")
            raise ValueError(msg)

        diff_file = args.diff_file
        new_ff = um_files[0].copy()
        new_ff.fields = [field for field in comparison.field_comparisons
                         if not field.data_match and field.data_shape_match]

        # Check if a land sea mask exists in the first file
        lsm = None
        for field in um_files[0].fields:
            if field.lbrel in (2, 3) and field.lbuser4 == 30:
                lsm = field
                break

        # Now double check that there weren't any differences between the
        # mask in the 2 files (if there were, the output for a land/sea
        # compressed field in this diff file will be very misleading)
        for field in new_ff.fields:
            if field.lbuser4 == 30:
                # If there is a land/sea mask difference, disable it
                lsm = None

        # Now check for land/sea packed fields within the diff file
        for ifield, field in enumerate(new_ff.fields):
            if (field.lbpack // 10) % 10 != 0:
                if lsm is not None:
                    # If we have the LSM, add it to the diff file
                    new_ff.fields.insert(0, lsm)
                    break
                else:
                    # Otherwise Mule won't let us output the field, so warn
                    # about this here and then remove it from the diff file
                    msg = ("Unable to output Field {0} as it is land/sea "
                           "packed but no suitable land-sea mask was found")
                    warnings.warn(msg.format(ifield + 1))
                    new_ff.fields.remove(field)

        # Assuming there are still writable fields in the diff file, write it
        if len(new_ff.fields) > 0:
            new_ff.to_file(diff_file)

    # If the files differ and the --fail-if-differ option has been used then
    # exit with a value of 1.
    if files_differ and args.fail_if_differ:
        sys.exit(1)

if __name__ == "__main__":
    _main()