Special I/O#

import h5rdmtoolbox as h5tbx
h5tbx.use(None)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import h5rdmtoolbox as h5tbx
      2 h5tbx.use(None)

File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/__init__.py:129
    125     with File(src) as h5:
    126         return h5.dumps()
--> 129 from h5rdmtoolbox.wrapper.ld.hdf.file import get_ld as hdf_get_ld
    130 from h5rdmtoolbox.wrapper.ld.user.file import get_ld as user_get_ld
    133 def get_ld(
    134         hdf_filename: Union[str, pathlib.Path],
    135         structural: bool = True,
    136         semantic: bool = True,
    137         blank_node_iri_base: Optional[str] = None,
    138         **kwargs) -> rdflib.Graph:

File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/wrapper/ld/__init__.py:1
----> 1 import ssnolib.ssno.standard_name
      2 from ontolutils.namespacelib import M4I
      3 from ontolutils.namespacelib import SCHEMA

ModuleNotFoundError: No module named 'ssnolib'

Creating datasets and CSV file(s)#

Datasets can be created directly from a single file or from multiple files. Let’s first create two simple CSV files:

import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the random values are reproducible.
np.random.seed(100)

# first CSV file: columns "x" and "y" with four random samples each
df = pd.DataFrame({'x': np.random.random((4, )),
                   'y': np.random.random((4, ))})
csv_filename1 = h5tbx.utils.generate_temporary_filename(suffix='.csv')
# index=False (a real bool, as pandas documents) keeps the row index out of the file
df.to_csv(csv_filename1, index=False)
df
x y
0 0.543405 0.004719
1 0.278369 0.121569
2 0.424518 0.670749
3 0.844776 0.825853
# second CSV file: columns "x" and "y" with eight random samples each
df = pd.DataFrame({'x': np.random.random((8, )),
                   'y': np.random.random((8, ))})
csv_filename2 = h5tbx.utils.generate_temporary_filename(suffix='.csv')
# index=False (a real bool, as pandas documents) keeps the row index out of the file
df.to_csv(csv_filename2, index=False)
df
x y
0 0.136707 0.811683
1 0.575093 0.171941
2 0.891322 0.816225
3 0.209202 0.274074
4 0.185328 0.431704
5 0.108377 0.940030
6 0.219697 0.817649
7 0.978624 0.336112

Create from a single file:

# Import the first CSV file into the opened file and display its content.
# NOTE(review): File() is called without a filename - presumably a temporary
# file is created; confirm against the h5tbx.File documentation.
with h5tbx.File() as h5:
    h5.create_dataset_from_csv(csv_filename=csv_filename1)
    h5.dump()
      (4) [float64]
      (x: 4) [float64]

When creating datasets from multiple CSV files, it must be decided whether to stack them (the datasets must have the same size) or to concatenate them:

… concatenating:

# Combine both CSV files (4 and 8 rows) by appending one after the other.
with h5tbx.File() as h5:
    h5.create_datasets_from_csv(
        csv_filenames=[csv_filename1, csv_filename2],
        combine_opt='concatenate',
    )
    h5.dump()
      (12) [float64]
      (x: 12) [float64]

… stacking:

# Stacking needs equally sized inputs, so the second CSV file is passed twice.
with h5tbx.File() as h5:
    h5.create_datasets_from_csv(
        csv_filenames=[csv_filename2, csv_filename2],
        combine_opt='stack',
    )
    h5.dump()
      (2, 8) [float64]
      (2, 8) [float64]

Creating datasets and image file(s)#

A dataset can be created from image data. The data can be provided as a list of numpy arrays:

# The image list holds the same (20, 10) random array five times
# (list repetition does not copy the array); it is passed with axis=0
# under the name 'testimg'.
with h5tbx.File() as h5:
    h5.create_dataset_from_image([np.random.random((20, 10))] * 5,
                                 'testimg', axis=0)
    h5.dump()
      (5, 20, 10) [float32]

… or as an iterable object which provides the image data one at a time:

class ImgReader:
    """Dummy image source that hands out a fixed number of random images.

    The object is its own iterator: it yields five images in total and is
    exhausted afterwards, because the internal counter is never reset.
    """

    def __init__(self, imgdir):
        # The directory is only stored; read_img() does not access it.
        self._imgdir = imgdir
        self._size = 5
        self._index = 0

    def __len__(self):
        """Return the total number of images this reader provides."""
        return self._size

    def __iter__(self):
        """Return the reader itself, which implements ``__next__``."""
        return self

    def __next__(self):
        """Return the next image or raise StopIteration after the last one."""
        if self._index >= self._size:
            raise StopIteration
        self._index += 1
        return self.read_img()

    def read_img(self):
        """Return a random (20, 10) array standing in for an image read from file."""
        return np.random.random((20, 10))
# ImgReader is a single-pass iterator (its __iter__ returns self and the index
# is never reset), so a fresh instance is created right before it is consumed.
imgreader = ImgReader('testdir')
with h5tbx.File() as h5:
    # Same call as with a list of arrays, but the images come from the reader.
    h5.create_dataset_from_image(imgreader, 'testimg', axis=0)
    h5.dump()
      (5, 20, 10) [float32]