import pickle
import sys
import warnings
import arviz as az
import numpy as np
import pandas as pd
from scipy.stats import norm
from tqdm.notebook import tqdm
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", ".*The group X_new is not defined in the InferenceData scheme.*")
warnings.filterwarnings("ignore", ".*X_new group is not defined in the InferenceData scheme.*")
[docs]
def load_data(sample_file, ages_file, proxies = ['d13c'], proxy_sigma_default = 0.1, drop_excluded_samples = True, drop_excluded_ages = True, combine_no_superposition = False):
"""
Import and pre-process proxy data and age constraints from .csv files formatted according to the :ref:`Data table formatting <datatable_target>` guidelines. To combine data from different .csv files, load each file separately and then combine the DataFrames with :py:meth:`combine_data() <stratmc.data.combine_data>`.
By default, samples marked ``Exclude? = True`` will be dropped from the data table. If ``sample_file.csv`` includes multiple proxy observations from the same stratigraphic horizon (for a given proxy), then all measurements marked ``Exclude? = False`` and ``superposition? = True `` will be combined using :py:meth:`combine_duplicates() <stratmc.data.combine_duplicates>`. Samples marked ``superposition? = False`` will remain separate, and their order will be randomized within the inference model. These default behaviors can be modified by passing the ``drop_excluded_samples`` and ``combine_no_superposition`` arguments.
Parameters
----------
sample_file: str
Path to .csv file containing proxy data for all sections (without '.csv' extension).
ages_file: str
Path to .csv file containing age constraints for all sections (without '.csv' extension).
proxies: str or list(str), optional
proxy names (must match column headers in ``sample_file.csv``); defaults to 'd13c'.
proxy_sigma_default: float or dict{float}, optional
Measurement uncertainty (:math:`1\\sigma`) to use for proxy observations if not specified in ``proxy_std`` column of ``sample_df``. To set a different value for each proxy, pass a dictionary with proxy names as keys. Defaults to 0.1.
drop_excluded_samples: bool, optional
Whether to remove samples with ``Exclude? = True`` from the ``sample_df``; defaults to ``True``. If excluded samples are not dropped, their ages will be passively tracked within the inference model (but they will not be considered during the proxy signal reconstruction).
drop_excluded_ages: bool, optional
Whether to remove ages with ``Exclude? = True`` from the ``ages_df``; defaults to ``True``.
combine_no_superposition: bool, optional
Whether to combine samples without superposition information by averaging their proxy values; defaults to ``False``.
Returns
-------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints for all sections.
"""
if type(proxies) == str:
proxies = list([proxies])
samples = pd.read_csv(sample_file + '.csv')
ages = pd.read_csv(ages_file + '.csv')
samples['section']=samples['section'].apply(str)
ages['section']=ages['section'].apply(str)
if 'shared?' not in list(ages.columns):
ages['shared?'] = False
if 'name' not in list(ages.columns):
ages['name'] = np.nan
ages['name']=ages['name'].apply(str)
if 'distribution_type' not in list(ages.columns):
ages['distribution_type'] = 'Normal'
if 'param_1' not in list(ages.columns):
ages['param_1'] = np.nan
if 'param_1_name' not in list(ages.columns):
ages['param_1_name'] = np.nan
if 'param_2' not in list(ages.columns):
ages['param_2'] = np.nan
if 'param_2_name' not in list(ages.columns):
ages['param_2_name'] = np.nan
if 'intermediate detrital?' not in list(ages.columns):
ages['intermediate detrital?'] = False
if 'intermediate intrusive?' not in list(ages.columns):
ages['intermediate intrusive?'] = False
if 'depositional?' not in list(ages.columns):
ages['depositional?'] = False
if 'Exclude?' not in list(ages.columns):
ages['Exclude?'] = False
if 'Exclude?' not in list(samples.columns):
samples['Exclude?'] = False
if 'superposition?' not in list(samples.columns):
samples['superposition?'] = True
if 'depositional age' not in list(samples.columns):
samples['depositional age'] = np.nan
if ('depth' in list(samples.columns)) or ('depth' in list(ages.columns)):
sample_df, ages_df = depth_to_height(samples, ages)
else:
sample_df = samples
ages_df = ages
if drop_excluded_samples:
sample_df = sample_df[~sample_df['Exclude?']]
if drop_excluded_ages:
ages_df = ages_df[~ages_df['Exclude?']]
# where there's more than 1 measurement for a proxy, combine (unless superposition = False)
sample_df = combine_duplicates(sample_df, proxies, proxy_sigma_default, combine_no_superposition = combine_no_superposition)
ages_df.sort_values(by = ['section', 'height'], inplace = True)
ages_df.reset_index(inplace = True, drop = True)
return sample_df, ages_df
[docs]
def depth_to_height(sample_df, ages_df):
"""
Helper function for converting depth in core to height in section.
Parameters
----------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints for all sections.
Returns
-------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections, with depth in core converted to height in section.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints for all sections, with depth in core converted to height in section.
"""
height = {}
age_height = {}
sections = np.unique(sample_df['section'])
for section in sections:
# if there are depth values, convert to height in section
if not sample_df[sample_df['section'] == section]['depth'].isnull().all():
depth_vec = sample_df[sample_df['section'] == section]['depth'].values
age_depth_vec = ages_df[ages_df['section'] == section]['depth'].values
all_depths = np.concatenate((sample_df[sample_df['section'] == section]['depth'].values,
ages_df[ages_df['section'] == section]['depth'].values))
max_depth = np.nanmax(all_depths)
height[section] = (depth_vec - max_depth) * -1
age_height[section] = (age_depth_vec - max_depth) * -1
sample_ind = sample_df.index[sample_df['section'] == section]
age_ind = ages_df.index[ages_df['section'] == section]
sample_df.loc[sample_ind, 'height'] = height[section]
ages_df.loc[age_ind, 'height'] = age_height[section]
ages_df = ages_df.sort_values(by = ['section', 'height'])
sample_df = sample_df.sort_values(by = ['section', 'height'])
return sample_df, ages_df
[docs]
def clean_data(sample_df, ages_df, proxies, sections):
"""
Helper function for cleaning sample data before running an inversion. Sets ``Exclude?`` to ``True`` for samples with no relevant proxy observations, removes sections where all samples have been excluded, and drops excluded age constraints.
Parameters
----------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints for all sections.
proxies: str or list(str)
Proxies to include in the inference.
sections: list(str) or numpy.array(str)
List of sections to include in the inference (as named in ``sample_df`` and ``ages_df``).
Returns
-------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing cleaned proxy data for all sections.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing cleaned age constraint data for all sections.
"""
if type(proxies) == str:
proxies = list([proxies])
if sample_df is not None:
# create a copy so it doesn't modify the original DataFrame
sample_df = sample_df.copy()
keep_idx = np.sort(np.unique((np.concatenate([sample_df.index[~np.isnan(sample_df[proxy])] for proxy in proxies]))))
exclude_idx = list(sample_df.index)
for idx in keep_idx:
exclude_idx.remove(idx)
# if sample has no relevant proxy observations, exclude from inference
sample_df.loc[exclude_idx, 'Exclude?'] = True
sample_df = sample_df[sample_df['section'].isin(sections)]
sample_df = sample_df.sort_values(by = ['section', 'height'])
sample_df = sample_df.reset_index(inplace = False, drop = True)
if ages_df is not None:
ages_df = ages_df.copy()
ages_df = ages_df[ages_df['section'].isin(sections)]
ages_df = ages_df[ages_df['Exclude?'] == False]
ages_df = ages_df.sort_values(by = ['section', 'height'])
ages_df = ages_df.reset_index(inplace = False, drop = True)
return sample_df, ages_df
[docs]
def combine_duplicates(sample_df, proxies, proxy_sigma_default = 0.1, combine_no_superposition = False):
"""
Helper function for combining multiple proxy measurements from the same stratigraphic horizon. For each horizon with multiple proxy values, replaces the proxy value with the mean, and replaces the standard deviation with the combined uncertainty (``proxy_std`` values summed in quadrature) for all measurements. The standard deviation of the population of proxy values for each horizon is stored in the ``proxy_population_std`` column of ``sample_df`` (in :py:meth:`build_model() <stratmc.model.build_model>`, the uncertainty of each proxy observation is modeled as the ``proxy_std`` and ``proxy_population_std`` values summed in quadrature).
Parameters
----------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections.
proxies: list(str)
List of proxies to include in the inference.
proxy_sigma_default: float or dict{float}, optional
Measurement uncertainty (:math:`1\\sigma`) to use for proxy observations if not specified in ``proxy_std`` column of ``sample_df``. To set a different value for each proxy, pass a dictionary with proxy names as keys. Defaults to 0.1.
combine_no_superposition: bool, optional
Whether to combine samples without superposition information by averaging their proxy values; defaults to ``False``.
Returns
-------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data with duplicates combined.
"""
sample_df = sample_df.copy()
if type(proxies) == str:
proxies = list([proxies])
if ((type(proxy_sigma_default) == float) or (type(proxy_sigma_default) == int)):
temp = proxy_sigma_default
proxy_sigma_default = {}
for proxy in proxies:
proxy_sigma_default[proxy] = temp
for proxy in proxies:
if proxy + '_std' not in list(sample_df.columns):
sample_df[proxy + '_std'] = np.nan
idx = np.isnan(sample_df[proxy + '_std'])
sample_df.loc[idx, proxy + '_std'] = proxy_sigma_default[proxy]
# don't consider excluded samples when averaging observations from same height -- remove from dataframe and add back later
excluded_sample_df = sample_df[sample_df['Exclude?']]
if combine_no_superposition:
sample_df = sample_df[(~sample_df['Exclude?'].values.astype(bool))]
else:
no_superposition_sample_df = sample_df[~sample_df['superposition?']]
no_superposition_sample_df.reset_index(inplace = True, drop = True)
sample_df = sample_df[(~sample_df['Exclude?'].values.astype(bool)) & (sample_df['superposition?'].values.astype(bool))]
excluded_sample_df.reset_index(inplace = True, drop = True)
sample_df.reset_index(inplace = True, drop = True)
dup_idx = np.where(sample_df.duplicated(subset = ['section', 'height'], keep = 'first').values)[0]
dup_idx = list(sample_df.iloc[dup_idx].index)
duplicate_dicts = []
for idx in dup_idx:
if idx in list(sample_df.index):
duplicate_rows = (sample_df['section'] == sample_df['section'][idx]) & (sample_df['height'] == sample_df['height'][idx])
duplicate_df = sample_df[duplicate_rows].copy()
duplicate_sub_idx = list(duplicate_df.index)
duplicate_dict = {}
proxy_columns = [proxy for proxy in proxies]
proxy_std_columns = [proxy + '_std' for proxy in proxies]
columns = list(sample_df.columns)
for c in proxy_columns:
columns.remove(c)
# replace proxy value with the mean
duplicate_dict[c] = np.nanmean(duplicate_df[c])
# standard deviation of the population of proxy values
duplicate_dict[c + '_population_std'] = np.nanstd(duplicate_df[c])
for c in proxy_std_columns:
columns.remove(c)
# replace the measurement uncertainty with the quadrature uncertainty (of the measurement uncertainties; this does not include the population standard deviation)
duplicate_dict[c] = np.sqrt(np.sum((duplicate_df[c])**2))
for c in columns:
duplicate_dict[c] = duplicate_df.iloc[0][c]
for key in list(duplicate_dict.keys()):
duplicate_dict[key] = [duplicate_dict[key]]
# remove the duplicate samples from sample_df
sample_df.drop(index = duplicate_sub_idx, inplace = True)
duplicate_dicts.append(duplicate_dict)
# add combined data to dataframe
for duplicate in duplicate_dicts:
sample_df = pd.concat([sample_df, pd.DataFrame.from_dict(duplicate)], ignore_index = True)
# put the excluded samples back
if excluded_sample_df.shape[0] > 0:
sample_df = pd.concat([sample_df, excluded_sample_df], ignore_index = True)
# put samples w/out superposition information back
if (not combine_no_superposition):
if no_superposition_sample_df.shape[0] > 0:
sample_df = pd.concat([sample_df, no_superposition_sample_df], ignore_index = True)
# sort and reset indexing
sample_df.sort_values(by = ['section', 'height'], inplace = True)
sample_df.reset_index(inplace = True, drop = True)
return sample_df
[docs]
def combine_data(dataframes):
"""
Helper function for merging :class:`pandas.DataFrame` objects containing proxy observations or age constraints. Data are merged using the ``section`` and ``height`` columns.
Parameters
----------
dataframes: list(pandas.DataFrame)
List of :class:`pandas.DataFrame` objects to merge.
Returns
-------
merged_data: pandas.DataFrame
:class:`pandas.DataFrame` containing merged data.
"""
data = pd.DataFrame(columns = ['section', 'height'])
for df in dataframes:
data = data.merge(df,
how = 'outer')
return data
[docs]
def combine_traces(trace_list):
"""
Helper function for combining multiple :class:`arviz.InferenceData` objects (saved as NetCDF files) that contain prior and posterior samples for the same inference model (sampled with :py:meth:`get_trace() <stratmc.inference.get_trace>` in :py:mod:`stratmc.inference`). The :class:`arviz.InferenceData` objects are concatenated along the ``chain`` dimension such that if two traces with 8 chains each are concatenated, the new combined trace will have 16 chains.
Parameters
----------
trace_list: list(str)
List of paths to :class:`arviz.InferenceData` objects (saved as NetCDF files) to be merged.
Returns
-------
combined_trace: arviz.InferenceData
New :class:`arviz.InferenceData` object containing the prior and posterior draws for all traces in ``trace_list``.
"""
combined_trace = load_trace(trace_list[0])
dataset = combined_trace.X_new.copy()
X_new = combined_trace.X_new.X_new.values
# concatenating attributes manually -- arviz throws an error when the sampling time attribute (or any other attribute) has a unique value in different traces
combined_attrs = {}
attr_keys = list(combined_trace.attrs.keys())
for key in attr_keys:
combined_attrs[key] = [combined_trace.attrs[key]]
del combined_trace.attrs[key]
del combined_trace.X_new
for path in tqdm(trace_list[1:]):
trace = load_trace(path)
for key in attr_keys:
combined_attrs[key].append(trace.attrs[key])
del trace.attrs[key]
if not np.array_equal(trace.X_new.X_new.values.ravel(), X_new.ravel()):
sys.exit("Traces have different X_new - check that all inferences were run with the same data and parameters")
del trace.X_new
az.concat([combined_trace, trace], dim = 'chain', inplace = True)
combined_trace.add_groups(dataset)
combined_trace.attrs = combined_attrs
return combined_trace
[docs]
def drop_chains(full_trace, chains):
"""
Remove a subset of chains from a :class:`arviz.InferenceData` object.
Parameters
----------
full_trace: arviz.InferenceData
An :class:`arviz.InferenceData` object containing the full set of prior and posterior samples from :py:meth:`get_trace() <stratmc.inference.get_trace>` in :py:mod:`stratmc.inference`.
chains: list or np.array of int
Indices of chains to remove from ``full_trace``.
Returns
-------
full_trace_clean: arviz.InferenceData
Copy of ``full_trace`` without the chains specified in ``chains``.
"""
all_chains = list(full_trace.posterior.chain.values)
for chain in chains:
all_chains.remove(chain)
full_trace_clean = full_trace.sel(chain = all_chains, inplace = False)
return full_trace_clean
[docs]
def thin_trace(full_trace, drop_freq = 2):
"""
Remove a subset of draws from a :class:`arviz.InferenceData` object. Only applies to groups associated with the posterior (the prior draws will not be affected).
Parameters
----------
full_trace: arviz.InferenceData
An :class:`arviz.InferenceData` object containing the full set of prior and posterior samples from :py:meth:`get_trace() <stratmc.inference.get_trace>` in :py:mod:`stratmc.inference`.
drop_freq: int
Frequency of draw removal. For example, 2 will remove every other draw, while 4 will remove every fourth draw.
Returns
-------
thinned_trace: arviz.InferenceData
Thinned version of ``full_trace``.
"""
all_draws = list(full_trace.posterior.draw.values)
drop_draws = list(full_trace.posterior.draw.values)[::drop_freq]
for draw in drop_draws:
all_draws.remove(draw)
thinned_trace = full_trace.sel(groups = ["posterior", "posterior_predictive", "sample_stats", "log_likelihood"], draw = all_draws, inplace = False)
return thinned_trace
[docs]
def save_trace(trace, path):
"""
Save trace (:class:`arviz.InferenceData` object) as a NetCDF file.
Parameters
----------
trace: arviz.InferenceData
An :class:`arviz.InferenceData` object containing the full set of prior and posterior samples from :py:meth:`build_model() <stratmc.model.build_model>` in :py:mod:`stratmc.model` (the output of :py:meth:`get_trace() <stratmc.inference.get_trace>` in :py:mod:`stratmc.inference`).
path: str
Location (including the file name, without '.nc' extension) to save ``trace``.
"""
trace.to_netcdf(path+'.nc', groups=['posterior', 'log_likelihood', 'prior', 'prior_predictive', 'posterior_predictive', 'observed_data', 'sample_stats', 'X_new'])
[docs]
def save_object(var, path):
"""
Save variable as a pickle (.pkl) object.
Parameters
----------
var:
Variable to be saved.
path: str
Location (including the file name, without '.pkl' extension) to save ``var``.
"""
with open(path+'.pkl', "wb") as buff:
pickle.dump(var, buff)
[docs]
def load_trace(path):
"""
Custom load command for NetCDF file containing a trace (:class:`arviz.InferenceData` object saved with :py:meth:`save_trace() <stratmc.data.save_trace>`).
Parameters
----------
path: str
Path to saved NetCDF file (without the '.nc' extension).
Returns
-------
trace: arviz.InferenceData
Trace saved as NetCDF file.
"""
trace = az.from_netcdf(path+'.nc')
return trace
[docs]
def load_object(path):
"""
Custom load command for pickle (.pkl) object (variables can be saved as .pkl files with :py:meth:`save_object() <stratmc.data.save_object>`).
Parameters
----------
path: str
Path to saved .pkl file (without the '.pkl' extension).
Returns
-------
var:
Variable saved in ``path``.
"""
with open(path + '.pkl', "rb") as input_file:
return pickle.load(input_file)
[docs]
def accumulation_rate(full_trace, sample_df, ages_df, method = 'all', age_model = 'posterior', include_age_constraints = True, **kwargs):
"""
Calculate apparent sediment accumulation rate between successive samples (if ``method = 'successive'``) or every possible sample pairing (``method = 'all'``).
Note that if ``method = 'all'``, rate is returned in mm/year, and duration is returned in years. If ``method = 'successive'``, rate is returned in m/Myr, and duration is returned in Myr. Input data are assumed to have units of meters and millions of years. Used as input to :py:meth:`sadler_plot() <stratmc.plotting.sadler_plot>` and :py:meth:`accumulation_rate_stratigraphy() <stratmc.plotting.accumulation_rate_stratigraphy>` in :py:mod:`stratmc.plotting`.
Parameters
----------
full_trace: arviz.InferenceData
An :class:`arviz.InferenceData` object containing the full set of prior and posterior samples from :py:meth:`get_trace() <stratmc.inference.get_trace>` in :py:mod:`stratmc.inference`.
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing all proxy data.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints from all sections.
method: str, optional
Whether to calculate accumulation rates between every possible sample pairing ('all'), or between successive samples ('successive'); defaults to 'all'.
age_model: str, optional
Whether to calculate accumulation rates using the the posterior or prior age model for each section; defaults to 'posterior'.
include_age_constraints: bool, optional
Whether to include radiometric age constraints in accumulation rate calculations; defaults to ``True``.
sections: list(str) or numpy.array(str), optional
List of sections to include. Defaults to all sections in ``sample_df``.
Returns
-------
rate_df: pandas.DataFrame
:class:`pandas.DataFrame` containing sediment accumulation rates and associated durations.
"""
# get list of proxies included in model from full_trace
variables = [
l
for l in list(full_trace["prior"].data_vars.keys()) # posterior
if (f"{'gp_ls_'}" in l) and (f"{'unshifted'}" not in l)
]
proxies = []
for var in variables:
proxies.append(var[6:])
if 'sections' in kwargs:
sections = list(kwargs['sections'])
else:
sections = np.unique(sample_df.dropna(subset = proxies, how = 'all')['section'])
sample_df, ages_df = clean_data(sample_df, ages_df, proxies, sections)
if method == 'all': # in mm/yr
duration = {}
rate = {}
rate_df = pd.DataFrame(columns = ['section', 'duration', 'rate'])
for section in sections:
section_df = sample_df[sample_df['section']==section]
sample_heights = section_df['height'].values * 1000 # convert meters to mm
age_heights = ages_df['height'][(ages_df['section']==section) & (~ages_df['Exclude?']) & (~ages_df['intermediate detrital?']) & (~ages_df['intermediate intrusive?'])] * 1000 # convert meters to mm
duration[section] = []
rate[section] = []
# shape (samples x draws)
if age_model == 'posterior':
sample_age_post = az.extract(full_trace.posterior)[str(section) + '_ages'].values
age_constraint_post = az.extract(full_trace.posterior)[str(section) + '_radiometric_age'].values
elif age_model == 'prior':
sample_age_post = az.extract(full_trace.prior)[str(section) + '_ages'].values
age_constraint_post = az.extract(full_trace.prior)[str(section) + '_radiometric_age'].values
if sample_age_post.shape[0] != len(sample_heights):
sys.exit(f"Number of data points for {section} does not match the number of data points in the trace.")
if include_age_constraints:
comb_heights = np.concatenate([sample_heights, age_heights])
sort_idx = np.argsort(comb_heights)
posterior_ages_stacked = np.vstack([sample_age_post, age_constraint_post])
draws = posterior_ages_stacked.shape[1]
else:
posterior_ages_stacked = sample_age_post
comb_heights = sample_heights
sort_idx = np.argsort(comb_heights)
draws = posterior_ages_stacked.shape[1]
sorted_heights = comb_heights[sort_idx]
max_idx = len(sorted_heights)
# for each draw
for n in np.arange(draws):
ages = posterior_ages_stacked[sort_idx, n] * 1e6 # put in order, and convert Myr to years
for i in np.arange(len(sorted_heights)):
for j in np.arange(i+1, max_idx): # if at the top sample, returns empty array
height_diff = sorted_heights[j] - sorted_heights[i]
age_diff = ages[i] - ages[j]
duration[section].append(age_diff)
rate[section].append(height_diff/age_diff)
section_rate_df = pd.DataFrame({'section': [section] * len(duration[section]), 'duration': duration[section], 'rate': rate[section]})
rate_df = pd.concat([rate_df.astype(section_rate_df.dtypes), section_rate_df], ignore_index = True)
elif method == 'successive': # in meters/Myr
duration = {}
rate = {}
base_height = {}
top_height = {}
base_age = {}
top_age = {}
rate_df = pd.DataFrame(columns = ['section', 'base_height', 'top_height', 'base_age', 'top_age', 'duration', 'rate'])
for section in sections:
base_height[section] = []
base_age[section] = []
top_height[section] = []
top_age[section] = []
duration[section] = []
rate[section] = []
section_df = sample_df[sample_df['section']==section]
sample_heights = section_df['height'].values
age_heights = ages_df['height'][(ages_df['section']==section) & (~ages_df['Exclude?']) & (~ages_df['intermediate detrital?']) & (~ages_df['intermediate intrusive?'])]
# shape (samples x draws)
if age_model == 'posterior':
sample_age_post = az.extract(full_trace.posterior)[str(section) + '_ages'].values
age_constraint_post = az.extract(full_trace.posterior)[str(section) + '_radiometric_age'].values
elif age_model == 'prior':
sample_age_post = az.extract(full_trace.prior)[str(section) + '_ages'].values
age_constraint_post = az.extract(full_trace.prior)[str(section) + '_radiometric_age'].values
if sample_age_post.shape[0] != len(sample_heights):
sys.exit(f"Number of data points for {section} does not match the number of data points in the trace.")
if include_age_constraints:
comb_heights = np.concatenate([sample_heights, age_heights])
sort_idx = np.argsort(comb_heights)
posterior_ages_stacked = np.vstack([sample_age_post, age_constraint_post])
draws = posterior_ages_stacked.shape[1]
else:
posterior_ages_stacked = sample_age_post
comb_heights = sample_heights
sort_idx = np.argsort(comb_heights)
draws = posterior_ages_stacked.shape[1]
sorted_heights = comb_heights[sort_idx]
max_idx = len(sorted_heights) - 1
# for each draw
for n in np.arange(draws):
ages = posterior_ages_stacked[sort_idx, n] # keep in Myr
for i in np.arange(len(sorted_heights)-1):
height_diff = sorted_heights[i+1] - sorted_heights[i]
age_diff = ages[i] - ages[i+1]
base_age[section].append(ages[i])
base_height[section].append(sorted_heights[i])
top_age[section].append(ages[i+1])
top_height[section].append(sorted_heights[i+1])
duration[section].append(age_diff)
rate[section].append(height_diff/age_diff)
section_rate_df = pd.DataFrame({'section': [section] * len(duration[section]),
'base_height': base_height[section],
'top_height': top_height[section],
'base_age': base_age[section],
'top_age': top_age[section],
'duration': duration[section],
'rate': rate[section]})
rate_df = pd.concat([rate_df.astype(section_rate_df.dtypes), section_rate_df], ignore_index = True)
return rate_df
[docs]
def downsample(sample_df, ages_df, N = 5000, likelihood_ratio_min = 0.5, proxy = 'd13c', keep = 'best', keep_seed = None, resample_with_lowest_n = True, flexible_n = True, best_criteria = 'corr_coef', split_environments = True, **kwargs):
"""
Subsample a set of proxy observations. Calculates the likelihood of the original stratigraphic signal given the subsampled signal and uncertainty in the data. Returns the solution that meets the mean likelihood ratio minimum with the lowest number of downsampled data points. See input parameter descriptions for additional details.
Parameters
----------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing proxy data for all sections.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints for all sections.
N: int
Number of random sample groupings to test. Defaults to 5,000.
likelihood_ratio_min: float or dict{float}, optional
Minimum acceptable likelihood ratio. For each section, the algorithm selects the smallest downsampled data set that meets this threshold. If multiple solutions with this minimum number of data points exist, then the solution with the highest correlation coefficient is selected if ``keep`` is 'best', while a random one of these solutions is selected if ``keep`` is 'random'. Must be in ``[0, 1]``; defaults to 0.5. Pass as a dictionary to specify a different value for each section.
keep: str
If there are multiple solutions that satisfy ``likelihood_ratio_min`` using the minimum possible number of data points, whether to return the best one of these solutions ('best'), or a random solution ('random'). Defaults to 'best'.
flexible_n: bool
Whether to consider solutions with 1 more data point than the minimum. Defaults to ``True``.
resample_with_lowest_n: bool
Whether to generate another N random solutions with the minimum number of data points required to staisfy ``likelihood_ratio_min`` (or one more than the minimum number of data points, if ``flexible_n = True``). Improves exploration of the solution space. Defaults to ``True``.
best_criteria: str
Which metric to use to identify the best solution among the candidate solutions that meet or exceed ``likelihood_ratio_min`` (if ``mode`` is 'best'). Either 'likelihood_ratio' (mean likelihood ratio) or 'corr_coef' (maximum Pearson correlation coefficient); defalts to 'corr_coef'.
split_environments: bool
Whether to insert breaks between different depositional environments (using 'Depositional Environment' column in ``sample_df``). Defaults to ``True``.
proxy: str, optional
Proxy to downsample. Defaults to 'd13c'.
sections: list(str) or numpy.array(str), optional
List of sections to downsample. Defaults to all sections in ``sample_df``.
Returns
-------
downsampled_data: pandas.DataFrame
:class:`pandas.DataFrame` containing downsampled proxy data. All samples are still included in the DataFrame, but samples that were excluded during downsampling are marked ``Exclude? = True``.
solution_likelihood_ratios: dict
Dictionary with the mean likelihood ratio for chosen solutions; keys are section names.
solution_corr_coefs: dict
Dictionary with the correlation coefficients for chosen solutions; keys are section names.
"""
sample_df_downsampled = sample_df.copy()
if 'sections' in kwargs:
sections = list(kwargs['sections'])
else:
sections = np.unique(sample_df_downsampled['section'])
if type(likelihood_ratio_min) != dict:
temp = likelihood_ratio_min
likelihood_ratio_min = {}
for section in sections:
likelihood_ratio_min[section] = temp
solution_likelihood_ratios = {}
solution_corr_coefs = {}
for section in tqdm(sections):
print(f'Downsampling {section}')
section_df = sample_df[(sample_df['section']==section) & (~sample_df['Exclude?'].astype(bool))].dropna(subset = proxy)
heights = section_df['height'].values
proxy_vec = section_df[proxy].values
proxy_std_vec = section_df[proxy + '_std'].values
if len(heights) > 2:
# grab required boundaries (changes in depositional environment, superposition, depositional ages)
required_boundaries = get_boundaries(sample_df, ages_df, proxy, section, environment = split_environments)
# make height grid to evaluate likelihood at original heights
height_grid = heights
# make proxy grid to evaluate likelihood at original heights
proxy_grid = proxy_vec
proxy_std_grid = proxy_std_vec
min_n_data = len(required_boundaries) - 2 + 1
max_n_data = len(heights)
# calculate the likelihood of the original signal, assuming we keep all data points (need for likelihood ratio calculation)
max_signal_log_prob = np.ones_like(proxy_grid) * np.nan
for i in range(len(proxy_grid)):
max_signal_log_prob[i] = norm.logpdf(proxy_grid[i], proxy_grid[i], proxy_std_grid[i]) # mean_proxy_std
random_data_idx = {}
corr_coef = np.ones(N) * np.nan
mean_likelihood_ratio = np.ones(N) * np.nan
n_data_points = np.ones(N) * np.nan
for i in tqdm(np.arange(N)):
rng = np.random.default_rng(seed = i)
random_data_idx[i] = []
n_data_points[i] = rng.choice(np.arange(np.max([min_n_data, 2]), max_n_data + 1), 1)[0]
candidate_idx = section_df.index.tolist()
# first, select 1 data point each from w/in each 'required' interval -- this ensures that all age constraints and depositional environments are represented
for interval, boundary_height in enumerate(required_boundaries[:-1]):
above = heights >= boundary_height
below = heights < required_boundaries[interval + 1]
cluster_idx = np.array(section_df.index.tolist())[above & below]
random_data_idx[i].append(rng.choice(cluster_idx, 1)[0])
# remove the chosen sample from the list of candidate data points
candidate_idx.remove(random_data_idx[i][-1])
# calculate how many additional data points we need to reach target number
n_remaining = n_data_points[i] - len(random_data_idx[i])
random_data_idx[i] += list(rng.choice(candidate_idx, int(n_remaining), replace = False))
random_data_idx[i].sort()
interp_proxy = np.interp(height_grid, section_df['height'].loc[random_data_idx[i]], section_df[proxy].loc[random_data_idx[i]])
# calculate likelihood of original signal, given interpolated signal and average proxy std dev
signal_log_prob = np.ones_like(proxy_grid) * np.nan
for j in range(len(proxy_grid)):
# use standard deviation of the measured value, instead of the mean
signal_log_prob[j] = norm.logpdf(interp_proxy[j], proxy_grid[j], proxy_std_grid[j]) # mean_proxy_std
mean_likelihood_ratio[i] = np.mean(np.exp(signal_log_prob - max_signal_log_prob))
corr_coef[i] = np.corrcoef(proxy_grid, interp_proxy)[0, 1]
above = (mean_likelihood_ratio >= likelihood_ratio_min[section])
keep_idx_list = np.argwhere(above)
# minimum number of data points in list
min_n_data = int(np.min(n_data_points[keep_idx_list]))
# NOTE -- this can rarely cause problems if it doesn't re-find the solution from the previous part w/ the sufficiently high correlation coeficient. in this case, either try re-running w/ higher N, or run with flexible size so that solutions with N+1 samples are also evaluated
if resample_with_lowest_n:
print(f'Resampling {section}')
for i in tqdm(np.arange(N)):
rng = np.random.default_rng(seed = i)
random_data_idx[i] = []
if flexible_n:
if min_n_data < len(heights):
n_data_points[i] = rng.choice([min_n_data, min_n_data + 1], 1)[0]
else:
n_data_points[i] = min_n_data
else:
n_data_points[i] = min_n_data
candidate_idx = section_df.index.tolist()
# first, select 1 data point each from w/in each 'required' interval -- this ensures that all age constraints and depositional environments are represented
for interval, boundary_height in enumerate(required_boundaries[:-1]):
above = heights >= boundary_height
below = heights < required_boundaries[interval + 1]
cluster_idx = np.array(section_df.index.tolist())[above & below]
random_data_idx[i].append(rng.choice(cluster_idx, 1)[0])
# remove the chosen sample from the list of candidate data points
candidate_idx.remove(random_data_idx[i][-1])
# calculate how many additional data points we need to reach target number
n_remaining = n_data_points[i] - len(random_data_idx[i])
random_data_idx[i] += list(rng.choice(candidate_idx, int(n_remaining), replace = False))
random_data_idx[i].sort()
interp_proxy = np.interp(height_grid, section_df['height'].loc[random_data_idx[i]], section_df[proxy].loc[random_data_idx[i]])
# calculate likelihood of original signal, given interpolated signal and average proxy std dev
signal_log_prob = np.ones_like(proxy_grid) * np.nan
for j in range(len(proxy_grid)):
signal_log_prob[j] = norm.logpdf(interp_proxy[j], proxy_grid[j], proxy_std_grid[j]) # mean_proxy_std
mean_likelihood_ratio[i] = np.mean(np.exp(signal_log_prob - max_signal_log_prob))
corr_coef[i] = np.corrcoef(proxy_grid, interp_proxy)[0, 1]
above = (mean_likelihood_ratio >= likelihood_ratio_min[section])
# solutions that meet minimum criteria
keep_idx_list = np.argwhere(above)
if len(keep_idx_list) == 0:
print('Try running with higher N; no viable solutions found after resampling')
# if size can be flexible (+1 larger than the minimum) (applies for both 'best' and 'random' modes)
if flexible_n:
n_data_idx = np.where((n_data_points[keep_idx_list] == min_n_data) | (n_data_points[keep_idx_list] == min_n_data + 1))[0]
else:
# get indices (within candidate list) where number of clusters is equal to the minimum
n_data_idx = np.where(n_data_points[keep_idx_list] == min_n_data)[0]
# use user-specified criteria (either corr coef or mean likelihood ratio) to pick the best solution
if (keep == 'best') or (len(n_data_idx) == 1):
# out of the viable solutions, choose the one with the highest correlation coefficient
if best_criteria == 'corr_coef':
best_idx_temp = np.nanargmax(corr_coef[keep_idx_list][n_data_idx])
# out of the viable solutions, choose the one with the lowest (mean) residuals between the interpolated signal and the excluded data points
elif best_criteria == 'likelihood_ratio':
best_idx_temp = np.argmax(mean_likelihood_ratio[keep_idx_list][n_data_idx])
best_idx = keep_idx_list[n_data_idx[best_idx_temp]][0]
elif (keep == 'random') and (len(n_data_idx) > 1):
keep_rng = np.random.default_rng(seed = keep_seed)
best_idx_temp = keep_rng.choice(n_data_idx, 1)[0]
best_idx = keep_idx_list[best_idx_temp][0]
#solution_corr_coefs[section] = corr_coef[best_idx]
solution_likelihood_ratios[section] = mean_likelihood_ratio[best_idx]
solution_corr_coefs[section] = corr_coef[best_idx]
# mark the samples we're keeping as Exclude? = False, and the rest as Exclude? = True
# don't change samples that didn't have data for the downsampled proxy anyway
sample_df_downsampled.loc[(sample_df_downsampled['section'] == section) & (~np.isnan(sample_df_downsampled[proxy])), 'Exclude?'] = True
sample_df_downsampled.loc[random_data_idx[best_idx], 'Exclude?'] = False
sample_df_downsampled['Exclude?'] = sample_df_downsampled['Exclude?'].astype(bool)
return sample_df_downsampled, solution_likelihood_ratios, solution_corr_coefs
[docs]
def get_boundaries(sample_df, ages_df, proxy, section, environment = True, depositional_ages = True, superposition = True):
"""
Helper function for :py:meth:`downsample() <stratmc.data.downsample>`. Returns list of height boundaries where the target section must be split into different groups. By default, inserts breaks between samples from different depositional environments, around groups of samples with the same depositional age, and around groups of samples without superposition information.
Parameters
----------
sample_df: pandas.DataFrame
:class:`pandas.DataFrame` containing all proxy data.
ages_df: pandas.DataFrame
:class:`pandas.DataFrame` containing age constraints from all sections.
proxy: str
Name of proxy to be downsampled.
section: str
Name of target section.
environment: bool
Whether to insert breaks between different depositional environments (requires 'Depositional Environment' column in ``sample_df``). Defaults to ``True``.
depositional_ages: bool
Whether to insert breaks around groups of samples with the same depositional age constraint. Defaults to ``True``.
superposition:
Whether to insert breaks around groups of samples without superposition information. Defaults to ``True``.
Returns
-------
boundary_heights: numpy.array
Array containing required boundaries.
"""
if 'Depositional Environment' not in list(sample_df.columns):
sample_df['Depositional Environment'] = np.nan
sample_df['Depositional Environment'] = sample_df['Depositional Environment'].astype(str)
section_df = sample_df[(sample_df['section']==section) & (~sample_df['Exclude?'].astype(bool))].dropna(subset = proxy)
section_ages_df = ages_df[(ages_df['section']==section) & (~ages_df['depositional?'])]
heights = section_df['height'].values
age_heights = section_ages_df['height'].values
# create a list of interval boundary heights: 1) age constraint, 2) change in 'superposition?' boolean, 3) change in depositional environment
boundary_heights = list(age_heights)
section_unique_dep_env = section_df['Depositional Environment'].unique()
section_dep_env = section_df['Depositional Environment'].values
section_superposition = section_df['superposition?'].values
section_dep_ages = section_df['depositional age'].values
section_unique_dep_ages = list(section_df['depositional age'].astype(str).unique())
if 'nan' in section_unique_dep_ages:
section_unique_dep_ages.remove('nan')
if superposition:
# if there's a change in superposition boolean within the interval, need to split into a separate interval (w/ same lower and upper bound -- just add the height twice)
if not (all(section_superposition)) or (all(~section_superposition)):
# grab heights of samples without superposition
super_heights = np.unique(section_df[~section_df['superposition?']]['height'])
for h in super_heights:
if h not in boundary_heights:
# append the height
boundary_heights += [h]
top_h_idx = np.where(section_df['height'] == h)[0][-1]
# bound with height of overlying sample, if not already done or at top of section
if (h != np.max(heights)):
if heights[top_h_idx + 1] not in super_heights:
boundary_heights.append(heights[top_h_idx + 1])
if depositional_ages:
# add boundaries around groups of samples with the same depositional age
for dep_age in section_unique_dep_ages:
# print(f'splitting depositional ages for section {section}')
dep_age_idx = np.where(section_dep_ages == dep_age)[0]
# assuming all the ages are in 1 chunk, add the base and the overlying sample
if all(np.diff(dep_age_idx) == 1):
# only add base if chunk isn't at base of section
if dep_age_idx[0] != 0:
boundary_heights.append(heights[dep_age_idx[0]])
# only add overlying sample if not at top of section (already bounded by another age constraint)
if dep_age_idx[-1] != len(heights) - 1:
boundary_heights.append(heights[dep_age_idx[-1] + 1])
else:
print(f'samples with depositional age {dep_age} in section {section} are not in a continuous chunk - check that depositional age assignment is correct')
if (len(dep_age_idx) > 1):
switch_idx = np.where(np.diff(dep_age_idx) != 1)[0]
boundary_heights += list(heights[dep_age_idx[switch_idx] + 1])
# add boundary above the uppermost chunk, unless it's the top of the secion (in which case there should already be an age constraint)
if dep_age_idx[-1] != len(heights) - 1:
boundary_heights += list([heights[dep_age_idx[-1] + 1]])
# add boundaries between different depositional environments
if environment:
if len(section_unique_dep_env) > 1:
for env in section_unique_dep_env:
env_idx = np.where(section_dep_env == env)[0]
# add base of lowermost group, unless we're at the bottom of the section
if env_idx[0] != 0:
boundary_heights.append(heights[env_idx[0]])
# if all samples from this environment are in 1 chunk, just add the top boundary
if (len(env_idx)) >= 1 and (all(np.diff(env_idx) == 1)):
# don't add to list if chunk is at top of the section (already bounded by an age constraint)
if (env_idx[-1] != len(heights) - 1):
boundary_heights.append(heights[env_idx[-1] + 1])
else:
# if there's more than one sample from this environment (scenario with only 1 is covered above)
if (len(env_idx) > 1):
switch_idx = np.where(np.diff(env_idx) != 1)[0]
boundary_heights += list(heights[env_idx[switch_idx] + 1])
# add boundary above the uppermost chunk, unless it's the top of the secion (in which case there should already be an age constraint)
if env_idx[-1] != len(heights) - 1:
boundary_heights += list([heights[env_idx[-1] + 1]])
# sort interval boundaries, and get rid of any duplicate boundaries
boundary_heights = np.sort(np.unique(boundary_heights))
boundary_heights = remove_extra_bounds(heights, boundary_heights)
return boundary_heights