Provenance#

One aspect of data provenance is keeping track of the processing steps applied to data.

Next, we will learn how this can be done when working with xarray objects.

Note: This is work in progress! Adding provenance information can clutter the attributes of an xarray object quite a bit.

import h5rdmtoolbox as h5tbx
import numpy as np

h5tbx.set_config(add_provenance=True)
<h5rdmtoolbox._cfg.set_config at 0x7fa9dc29a6a0>

For the example, let’s assume a 3D velocity field with time, y and x dimensions:

cv = h5tbx.convention.from_zenodo('https://zenodo.org/records/10428822')

h5tbx.use(cv)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[2], line 1
----> 1 cv = h5tbx.convention.from_zenodo('https://zenodo.org/records/10428822')
      3 h5tbx.use(cv)

File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.4.1/h5rdmtoolbox/convention/core.py:758, in from_zenodo(doi_or_recid, name, overwrite, force_download)
    755 if not filename.exists() or force_download:
    756     record = zenodo.ZenodoRecord(rec_id)
--> 758     filenames = list(record.files.keys())
    759     if name is None:
    760         matches = [file for file in filenames if pathlib.Path(file).suffix == '.yaml']

File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.4.1/h5rdmtoolbox/repository/zenodo/core.py:602, in ZenodoRecord.files(self)
    595 @property
    596 def files(self) -> Dict[str, RepositoryFile]:
    597     # def _parse_download_url(filename):
    598     #     if filename is None:
    599     #         return filename
    600     #     return f"{self.rec_url}/{self.rec_id}/files/{filename}"
--> 602     is_submitted = self.submitted()
    604     def _parse_download_url(url, filename):
    605         if url is None:

File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.4.1/h5rdmtoolbox/repository/zenodo/core.py:558, in ZenodoRecord.is_published(self)
    556 def is_published(self) -> bool:
    557     """Check if the deposit is published."""
--> 558     return self.json()['submitted']

KeyError: 'submitted'
# Build the example file: three 1D coordinate datasets plus one 3D dataset `u`.
with h5tbx.File(data_type='experimental', contact=h5tbx.__author_orcid__) as h5:
    # Coordinate datasets, each created with make_scale=True so that `u` can refer to them below.
    h5.create_dataset('time', data=np.linspace(0, 5, 5), standard_name='time', units='s', make_scale=True)
    h5.create_dataset('y', data=np.linspace(0, 10, 10), standard_name='y_coordinate', units='m', make_scale=True)
    h5.create_dataset('x', data=np.linspace(0, 7, 7), standard_name='x_coordinate', units='m', make_scale=True)
    # Random data of shape (5, 10, 7), matching the lengths of ('time', 'y', 'x') created above.
    h5.create_dataset('u', data=np.random.rand(5, 10, 7), standard_name='x_velocity', units='m/s', attach_scale=('time', 'y', 'x'))
    # Read the whole dataset while the file is still open; `u` is used after the `with` block.
    u = h5.u[:]

# plot t=2.5 s:
u.sel(time=2.5).plot()
<matplotlib.collections.QuadMesh at 0x1884c7ec130>
../../../_images/3b6dc59124afb5b5d873d02a62810409a3dc4b886eb898c3ca44a03be03b0b29.png
# NOTE(review): `accessor` is never referenced by name; the import is presumably needed
# only for its side effect of making the `.snt` accessor available — confirm.
from h5rdmtoolbox.convention.standard_names import accessor
# Two chained steps: slice the first two entries along the first axis, then apply
# `arithmetic_mean_of` along 'time'.
u_processed = u.snt[0:2,...].snt.arithmetic_mean_of(dim='time')

def get_dim_shape(da):
    """Map every dimension name of ``da`` to the length of that dimension.

    ``da`` has to provide a ``dims`` iterable of names and support
    ``da[name]`` lookups whose result has a length (as the xarray objects
    used in this example do).

    Returns a dict ``{dimension name: length}`` in the order of ``da.dims``.
    """
    shape_by_dim = {}
    for dim_name in da.dims:
        shape_by_dim[dim_name] = len(da[dim_name])
    return shape_by_dim

def explain_history(da):
    """Print the processing steps recorded in the provenance attributes of ``da``.

    Reads ``da.attrs['PROVENANCE']['processing_history']`` and, for each entry,
    prints its index, the step name and the dimension shape stored for the
    parent array, followed by a second line repeating the step name.
    Finally prints the current dimension shape of ``da``.

    Raises ``KeyError`` if the expected provenance keys are missing.
    """
    history = da.attrs['PROVENANCE']['processing_history']
    for step_no, step in enumerate(history):
        step_name = step['name']
        # Shape of the array the step was applied to, not of its result.
        parent_shape = step['parent']['dims_shape']
        print(step_no, 'applied ', step_name, ' on array with shape', parent_shape)
        print('  -> ', step_name)
    print('Current shape ', get_dim_shape(da))

explain_history(u_processed)
0 applied  __getitem__  on array with shape {'time': 5, 'y': 10, 'x': 7}
  ->  __getitem__
1 applied  arithmetic_mean_of  on array with shape {'time': 2, 'y': 10, 'x': 7}
  ->  arithmetic_mean_of
Current shape  {'y': 10, 'x': 7}
h5tbx.set_config(add_provenance=False)
<h5rdmtoolbox._cfg.set_config at 0x1884c8a0400>