"""Tools for generating comparisons between TSUNAMI-IP calculated integral parameters (e.g. :math:`c_k`, :math:`E`) and calculated
values using correlation methods with either cross section sampling or uncertainty contributions."""
from tsunami_ip_utils.readers import read_integral_indices
from tsunami_ip_utils.integral_indices import calculate_E
import numpy as np
from uncertainties import unumpy
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from tsunami_ip_utils.perturbations import generate_points
from typing import List, Dict, Tuple, Any, Optional, Union
from tsunami_ip_utils.viz.scatter_plot import EnhancedPlotlyFigure, InteractiveScatterLegend
from tsunami_ip_utils.viz.plot_utils import generate_plot_objects_array_from_perturbations, generate_plot_objects_array_from_contributions
from tsunami_ip_utils.integral_indices import get_uncertainty_contributions, calculate_E_contributions
from tsunami_ip_utils.viz import matrix_plot
import multiprocessing
from tsunami_ip_utils.utils import _run_and_read_TSUNAMI_IP, _convert_paths
def E_calculation_comparison(application_filenames: Union[List[str], List[Path]],
experiment_filenames: Union[List[str], List[Path]], coverx_library: str="252groupcov7.1",
                              tsunami_ip_output_filename: Optional[Union[str, Path]]=None) -> Dict[str, DataFrame]:
"""Function that compares the calculated similarity parameter E with the TSUNAMI-IP output for each application with each
experiment. The comparison is done for the nominal values and the uncertainties of the E values. In addition, the
difference between manually calculated uncertainties and automatically calculated uncertainties (i.e. via the uncertainties
package) is also calculated. The results are returned as a pandas DataFrame.
Parameters
----------
application_filenames
Paths to the application sdf files.
experiment_filenames
Paths to the experiment sdf files.
    coverx_library
The coverx library to use for TSUNAMI-IP. Default is ``'252groupcov7.1'``.
    tsunami_ip_output_filename
        Optional path to the TSUNAMI-IP output file. If not specified, the function will run TSUNAMI-IP to calculate the
        E values using the template file :ref:`MG_reader.inp`.
Returns
-------
Dictionary of pandas DataFrames for each type of E index. The keys are ``'total'``, ``'fission'``, ``'capture'``,
and ``'scatter'``. The DataFrames contain the calculated E values, the manual uncertainties, the TSUNAMI-IP values,
the relative difference in the mean, and the relative difference in the manually computed uncertainty. The DataFrames
are indexed by the experiment number and the columns are a MultiIndex with the application number as the main index
and the attributes as the subindex.
    Notes
    -----
    Each of the results DataFrames in the dictionary can easily be written to Excel using the pandas ``to_excel`` method.
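
    Examples
    --------
    A minimal sketch of the intended usage (the SDF paths below are hypothetical placeholders, and running this
    requires a working SCALE/TSUNAMI-IP installation):

    >>> results = E_calculation_comparison(
    ...     application_filenames=['application1.sdf', 'application2.sdf'],
    ...     experiment_filenames=['experiment1.sdf', 'experiment2.sdf'],
    ... )
    >>> results['total'].to_excel('E_total_comparison.xlsx')
    """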
    # First calculate each type of E index, with both automatic and manual uncertainty propagation
    E_types = ['total', 'fission', 'capture', 'scatter']
    # Map each E index type to the corresponding reaction_type argument of calculate_E
    reaction_types = {'total': 'all', 'fission': 'fission', 'capture': 'capture', 'scatter': 'elastic'}
    E = {}
    for E_type in E_types:
        # Automatic uncertainty propagation (via the uncertainties package)
        E[E_type] = calculate_E(application_filenames, experiment_filenames, reaction_type=reaction_types[E_type])
        # Manual uncertainty propagation
        E[E_type + '_manual'] = calculate_E(application_filenames, experiment_filenames,
                                            reaction_type=reaction_types[E_type], uncertainties='manual')
print("Done with calculations")
# Now read the tsunami_ip output if given, and calculate it if not
if tsunami_ip_output_filename is not None:
tsunami_ip_output = read_integral_indices(tsunami_ip_output_filename)
else:
tsunami_ip_output = _run_and_read_TSUNAMI_IP(application_filenames, experiment_filenames, coverx_library)
# Compare the nominal values
E_diff = {}
for E_type in E_types:
E_diff[E_type] = np.abs(unumpy.nominal_values(E[E_type]) - unumpy.nominal_values(tsunami_ip_output[f"E_{E_type}"])) \
/ unumpy.nominal_values(tsunami_ip_output[f"E_{E_type}"])
# Compare the calculated (manual) uncertainty with the TSUNAMI-IP uncertainty
E_diff_unc = {}
for E_type in E_types:
E_diff_unc[E_type] = np.abs( unumpy.std_devs(E[E_type + '_manual']) - unumpy.std_devs(tsunami_ip_output[f"E_{E_type}"]) ) \
/ unumpy.std_devs(tsunami_ip_output[f"E_{E_type}"])
# -----------------------------------------
# Format the results as a pandas DataFrame
# -----------------------------------------
# Create a MultiIndex for columns
num_experiments, num_applications = np.shape(E['total'])
columns = pd.MultiIndex.from_product([
np.arange(1, num_applications + 1), # Main column indices
['Calculated', 'Manual Uncertainty', 'TSUNAMI-IP', 'Relative Difference in Mean', 'Relative Difference in Manual Uncertainty'] # Subcolumns
], names=['Application Number', 'Attribute'])
# Initialize DataFrame
data = {}
print("Creating pandas dataframes")
# Create a pandas DataFrame for each type of E index
for E_type in E_types:
data[E_type] = pd.DataFrame(index=pd.Index(np.arange(1, num_experiments + 1), name='Experiment Number'), columns=columns)
# Populate DataFrame
for application_index in range(num_applications):
for experiment_index in range(num_experiments):
# Now write the data to the DataFrame
data[E_type].loc[experiment_index + 1, (application_index + 1, 'Calculated')] = \
f"{E[E_type][experiment_index, application_index].n:1.3E}+/-{E[E_type][experiment_index, application_index].s:1.2E}"
data[E_type].loc[experiment_index + 1, (application_index + 1, 'Manual Uncertainty')] = \
f"{E[E_type + '_manual'][experiment_index, application_index].s:1.2E}"
data[E_type].loc[experiment_index + 1, (application_index + 1, 'TSUNAMI-IP')] = \
f"{tsunami_ip_output[f'E_{E_type}'][experiment_index, application_index].n:1.3E}+/-{tsunami_ip_output[f'E_{E_type}'][experiment_index, application_index].s:1.2E}"
data[E_type].loc[experiment_index + 1, (application_index + 1, 'Relative Difference in Mean')] = f"{E_diff[E_type][experiment_index, application_index]:1.4E}"
data[E_type].loc[experiment_index + 1, (application_index + 1, 'Relative Difference in Manual Uncertainty')] = f"{E_diff_unc[E_type][experiment_index, application_index]:1.4E}"
return data
def _update_annotation(fig: EnhancedPlotlyFigure, integral_index: float, index_name: str
) -> Tuple[EnhancedPlotlyFigure, float, float]:
"""Update the annotation on the plot to include the TSUNAMI-IP c_k value and the percent difference from the
Pearson correlation coefficient. If the percent difference is greater than 5%, the annotation will be colored red.
Parameters
----------
fig
The plotly figure object.
integral_index
The TSUNAMI-IP integral_index value.
index_name
The name of the integral index.
Returns
-------
* fig
The plotly figure object with the updated annotation.
* calculated_value
The calculated value of the integral index.
* percent_difference
The percent difference between the TSUNAMI-IP integral_index value and the Pearson correlation coefficient.
"""
    fig_has_annotations = isinstance(fig, (EnhancedPlotlyFigure, InteractiveScatterLegend))
    if not fig_has_annotations:
        # On the diagonal are usually the contributions in a pie plot, and the calculated c_k is 1.0
        calculated_value = 1.0
        percent_difference = (integral_index - calculated_value) / integral_index * 100
        return fig, calculated_value, percent_difference
    # For an InteractiveScatterLegend, the plotly figure is stored under the 'fig' attribute
    plotly_fig = fig.fig if isinstance(fig, InteractiveScatterLegend) else fig
    summary_stats_annotation = plotly_fig.layout.annotations[0]
    calculated_value = plotly_fig.statistics['pearson']
    percent_difference = (integral_index - calculated_value) / integral_index * 100
    summary_stats_annotation.text += f"<br>TSUNAMI-IP {index_name}: <b>{integral_index}</b><br>Percent Difference: <b>{percent_difference}</b>%"
    if abs(percent_difference) > 5:
        summary_stats_annotation.update(bordercolor='red')
    return fig, calculated_value, percent_difference
def _process_pair(args):
    """Multiprocessing worker that generates the perturbation points for a single (application, experiment) pair
    and returns the Pearson correlation coefficient along with its percent difference from the TSUNAMI-IP value."""
application_file, experiment_file, base_library, perturbation_factors, num_perturbations, integral_value = args
points_array = generate_points(
application_file,
experiment_file,
base_library=base_library,
perturbation_factors=perturbation_factors,
num_perturbations=num_perturbations
)
x_points = unumpy.nominal_values([ pair[0] for pair in points_array ])
y_points = unumpy.nominal_values([ pair[1] for pair in points_array ])
# Calculate the Pearson correlation coefficient
calculated_value = np.corrcoef(x_points, y_points)[0, 1]
percent_difference = (integral_value - calculated_value) / integral_value * 100
return calculated_value, percent_difference
@_convert_paths
def correlation_comparison(integral_index_matrix: unumpy.uarray, integral_index_name: str,
application_files: Union[List[str], List[Path]],
experiment_files: Union[List[str], List[Path]], method: str,
base_library: Optional[Union[str, Path]]=None,
perturbation_factors: Optional[Union[str, Path]]=None,
num_perturbations: Optional[int]=None, make_plot: bool=True,
num_cores: int=multiprocessing.cpu_count() - 2,
plot_objects_kwargs: dict={},
matrix_plot_kwargs: dict={}) -> Tuple[pd.DataFrame, Any]:
"""Function that compares the calculated similarity parameter :math:`c_k` (calculated using the cross section sampling method)
with the TSUNAMI-IP output for each application and each experiment. NOTE: that the experiment sdfs and application sdfs
must correspond with those in hte TSUNAMI-IP input file.
Notes
-----
* If the chosen method is 'perturbation', the matrix plot can become extremely memory intensive, so it is recommended
to set ``make_plot=False`` (if only the matrix of comparisons is desired) and/or to use ``num_cores=1`` or a small
number of perturbations to avoid memory issues.
Parameters
----------
    integral_index_matrix
        The matrix representing the given integral index. Expected shape is ``(num_applications, num_experiments)``.
    integral_index_name
        The name of the integral index (used for selecting the method for the plot). Allowed values are ``'c_k'`` and ``'E'``.
application_files
Paths to the input files for the application (required by the chosen method, either TSUNAMI ``.out`` files or
TSUNAMI ``.sdf`` files).
experiment_files
Paths to the input files for the experiment (required by the chosen method, either TSUNAMI ``.out`` files or
TSUNAMI ``.sdf`` files).
    method
        The method for visualizing the given integral index. Allowed values are ``'perturbation'``,
        ``'uncertainty_contributions_nuclide'``, ``'uncertainty_contributions_nuclide_reaction'``, ``'variance_contributions_nuclide'``,
        ``'variance_contributions_nuclide_reaction'``, ``'E_contributions_nuclide'``, ``'E_contributions_nuclide_reaction'``,
        and ``'c_k_contributions'``.
    base_library
        Path to the base cross section library. Required when ``method='perturbation'``.
    perturbation_factors
        Path to the perturbation factors. Required when ``method='perturbation'``.
    num_perturbations
        Number of perturbations to generate. Required when ``method='perturbation'``.
make_plot
Whether to generate the matrix plot. Default is ``True``.
    num_cores
        The number of cores to use for multiprocessing; only used when ``make_plot=False`` with ``method='perturbation'``.
        Default is two less than the number of available cores.
plot_objects_kwargs
Optional keyword arguments to pass when generating the plot objects.
matrix_plot_kwargs
Optional keyword arguments to pass when generating the matrix plot.
Returns
-------
* comparisons
A pandas DataFrame containing the calculated integral index values, the TSUNAMI-IP values, and the percent
difference between the two values.
    * matrix_plot
        The matrix plot object containing the integral index values and the percent differences; only returned
        when ``make_plot=True``, otherwise only ``comparisons`` is returned.
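
    Examples
    --------
    A minimal sketch of the perturbation method (all paths below are hypothetical placeholders, and ``c_k`` is
    assumed to have been read beforehand, e.g. with :func:`tsunami_ip_utils.readers.read_integral_indices`):

    >>> comparisons, fig = correlation_comparison(
    ...     integral_index_matrix=c_k,
    ...     integral_index_name='c_k',
    ...     application_files=['application1.sdf'],
    ...     experiment_files=['experiment1.sdf'],
    ...     method='perturbation',
    ...     base_library='base_library_path',
    ...     perturbation_factors='perturbation_factors_path',
    ...     num_perturbations=50,
    ... )
    >>> comparisons.to_excel('c_k_comparison.xlsx')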
"""
# ===================================
# Perform checks on input parameters
# ===================================
# Check for consistent dimensions of the integral index matrix
num_applications, num_experiments = np.shape(integral_index_matrix)
if num_applications != len(application_files) or num_experiments != len(experiment_files):
raise ValueError("The dimensions of the integral index matrix do not match the number of applications and experiments.")
# Check for missing input parameters or inconsistent method
missing_perturbation_parameters = any([base_library is None, perturbation_factors is None,
num_perturbations is None])
if method == 'perturbation' and missing_perturbation_parameters:
raise ValueError("The method is 'perturbation', and some of the required additional parameters are missing.")
method_for_calculating_c_k = method in ['uncertainty_contributions_nuclide', 'uncertainty_contributions_nuclide_reaction',
'variance_contributions_nuclide', 'variance_contributions_nuclide_reaction',
'perturbation', 'c_k_contributions']
method_for_calculating_E = method in ['E_contributions_nuclide', 'E_contributions_nuclide_reaction']
    if integral_index_name == 'c_k' and not method_for_calculating_c_k:
        raise ValueError("The integral index name is 'c_k', but a method for calculating c_k was not selected; "
                         f"instead, the method selected was {method}. Please select a method for calculating c_k.")
    if integral_index_name == 'E' and not method_for_calculating_E:
        raise ValueError("The integral index name is 'E', but a method for calculating E was not selected; "
                         f"instead, the method selected was {method}. Please select a method for calculating E.")
# ===================================
# Generate plots for the matrix plot
# ===================================
match method:
case 'perturbation':
# This is the most memory intensive method, so instead of storing the entire points array, if make_plot is False,
# we will just calculate the Pearson correlation coefficient from the points themselves, and only store the
# results array.
if make_plot:
points_array = generate_points(application_files, experiment_files, base_library, perturbation_factors,
num_perturbations)
plot_objects_array = generate_plot_objects_array_from_perturbations(points_array, **plot_objects_kwargs)
else:
num_applications = len(application_files)
num_experiments = len(experiment_files)
# Prepare arguments for multiprocessing
tasks = [(application_files[i], experiment_files[j], base_library, perturbation_factors, num_perturbations, integral_index_matrix[i, j])
for i in range(num_applications) for j in range(num_experiments)]
# Use multiprocessing to process data
with multiprocessing.Pool(processes=num_cores) as pool:
results = pool.map(_process_pair, tasks)
# Reshape the results into matrices
calculated_values = np.array([result[0] for result in results]).reshape(num_applications, num_experiments)
percent_differences = np.array([result[1] for result in results]).reshape(num_applications, num_experiments)
case 'uncertainty_contributions_nuclide':
contributions_nuclide, _ = get_uncertainty_contributions(application_files, experiment_files)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide, '%Δk/k', **plot_objects_kwargs) \
if make_plot else None
case 'uncertainty_contributions_nuclide_reaction':
_, contributions_nuclide_reaction = get_uncertainty_contributions(application_files, experiment_files)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide_reaction, '%Δk/k', **plot_objects_kwargs) \
if make_plot else None
case 'variance_contributions_nuclide':
contributions_nuclide, _ = get_uncertainty_contributions(application_files, experiment_files, variance=True)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide, '%(Δk/k)^2', **plot_objects_kwargs) \
if make_plot else None
case 'variance_contributions_nuclide_reaction':
_, contributions_nuclide_reaction = get_uncertainty_contributions(application_files, experiment_files, variance=True)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide_reaction, '%(Δk/k)^2', **plot_objects_kwargs) \
if make_plot else None
case 'E_contributions_nuclide':
contributions_nuclide, _ = calculate_E_contributions(application_files, experiment_files)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide,
integral_index_name, **plot_objects_kwargs) \
if make_plot else None
case 'E_contributions_nuclide_reaction':
_, contributions_nuclide_reaction = calculate_E_contributions(application_files, experiment_files)
plot_objects_array = generate_plot_objects_array_from_contributions(contributions_nuclide_reaction,
integral_index_name, **plot_objects_kwargs) \
if make_plot else None
case 'c_k_contributions':
raise NotImplementedError("The method 'c_k_contributions' is not yet implemented.")
# ==============================
# Update plots with annotations
# ==============================
if make_plot:
percent_differences = np.empty_like(integral_index_matrix)
calculated_values = np.empty_like(integral_index_matrix)
for i, row in enumerate(plot_objects_array):
for j, plot_object in enumerate(row):
updated_plot_object, calculated_value, percent_difference = \
_update_annotation(plot_object, integral_index_matrix[i, j], integral_index_name)
calculated_values[i, j] = calculated_value
percent_differences[i, j] = percent_difference
plot_objects_array[i, j] = updated_plot_object
    elif method != 'perturbation':
        # The calculated values for contribution-based methods are derived from the plot objects' Pearson
        # statistics, which are not generated when make_plot=False, so there is nothing to compare against
        raise ValueError("make_plot=False is currently only supported with method='perturbation', not "
                         f"method='{method}'.")
# ===================================
# Create dataframes with comparisons
# ===================================
num_applications, num_experiments = np.shape(integral_index_matrix)
columns = pd.MultiIndex.from_product([
np.arange(1, num_applications + 1), # Main column indices
['Calculated', 'TSUNAMI-IP', 'Percent Difference'] # Subcolumns
], names=['Application Number', 'Attribute'])
comparisons = pd.DataFrame(index=pd.Index(np.arange(1, num_experiments + 1), name='Experiment Number'), columns=columns)
# Populate DataFrame
for application_index in range(num_applications):
for experiment_index in range(num_experiments):
# Now write the data to the DataFrame
            comparisons.loc[experiment_index + 1, (application_index + 1, 'Calculated')] = \
                f"{calculated_values[application_index, experiment_index]:1.3E}"
            comparisons.loc[experiment_index + 1, (application_index + 1, 'TSUNAMI-IP')] = \
                f"{integral_index_matrix[application_index, experiment_index].n:1.3E}+/-{integral_index_matrix[application_index, experiment_index].s:1.2E}"
            comparisons.loc[experiment_index + 1, (application_index + 1, 'Percent Difference')] = \
                f"{percent_differences[application_index, experiment_index].n:2.2f}+/-{percent_differences[application_index, experiment_index].s:1.2E}"
# ===================
# Create matrix plot
# ===================
if make_plot:
fig = matrix_plot(plot_objects_array, 'interactive', **matrix_plot_kwargs)
return comparisons, fig
else:
return comparisons