Special I/O#
import h5rdmtoolbox as h5tbx
# NOTE(review): presumably switches off any active convention for this
# tutorial -- confirm against the h5rdmtoolbox documentation.
h5tbx.use(None)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import h5rdmtoolbox as h5tbx
2 h5tbx.use(None)
File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/__init__.py:129
125 with File(src) as h5:
126 return h5.dumps()
--> 129 from h5rdmtoolbox.wrapper.ld.hdf.file import get_ld as hdf_get_ld
130 from h5rdmtoolbox.wrapper.ld.user.file import get_ld as user_get_ld
133 def get_ld(
134 hdf_filename: Union[str, pathlib.Path],
135 structural: bool = True,
136 semantic: bool = True,
137 blank_node_iri_base: Optional[str] = None,
138 **kwargs) -> rdflib.Graph:
File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/wrapper/ld/__init__.py:1
----> 1 import ssnolib.ssno.standard_name
2 from ontolutils.namespacelib import M4I
3 from ontolutils.namespacelib import SCHEMA
ModuleNotFoundError: No module named 'ssnolib'
Creating datasets from CSV file(s)#
Datasets can be created directly from a single CSV file or from multiple CSV files. Let’s first create two simple CSV files:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so the values shown are reproducible.
np.random.seed(100)
# first: a 4-row table with columns 'x' and 'y', written to a temporary CSV file
df = pd.DataFrame({'x': np.random.random((4, )),
'y': np.random.random((4, ))})
csv_filename1 = h5tbx.utils.generate_temporary_filename(suffix='.csv')
df.to_csv(csv_filename1, index=None)
# Bare expression: displays the DataFrame as the notebook cell output.
df
| | x | y |
|---|---|---|
| 0 | 0.543405 | 0.004719 |
| 1 | 0.278369 | 0.121569 |
| 2 | 0.424518 | 0.670749 |
| 3 | 0.844776 | 0.825853 |
# second: an 8-row table with the same columns, written to another temporary
# CSV file (note that `df` is rebound here)
df = pd.DataFrame({'x': np.random.random((8, )),
'y': np.random.random((8, ))})
csv_filename2 = h5tbx.utils.generate_temporary_filename(suffix='.csv')
df.to_csv(csv_filename2, index=None)
# Bare expression: displays the DataFrame as the notebook cell output.
df
| | x | y |
|---|---|---|
| 0 | 0.136707 | 0.811683 |
| 1 | 0.575093 | 0.171941 |
| 2 | 0.891322 | 0.816225 |
| 3 | 0.209202 | 0.274074 |
| 4 | 0.185328 | 0.431704 |
| 5 | 0.108377 | 0.940030 |
| 6 | 0.219697 | 0.817649 |
| 7 | 0.978624 | 0.336112 |
Create from a single file:
# Open a File without passing a filename, read the first CSV file into it
# and print the resulting file content.
with h5tbx.File() as h5:
    h5.create_dataset_from_csv(csv_filename=csv_filename1)
    h5.dump()
-
-
(4) [float64]
-
(x: 4) [float64]
When creating datasets from multiple CSV files, you must decide whether to stack them (the datasets must have the same size) or concatenate them:
… concatenating:
# Combine both CSV files with combine_opt='concatenate'; the files have
# 4 and 8 rows respectively.
with h5tbx.File() as h5:
    h5.create_datasets_from_csv(csv_filenames=[csv_filename1, csv_filename2],
                                combine_opt='concatenate')
    h5.dump()
-
-
(12) [float64]
-
(x: 12) [float64]
… stacking:
# Combine with combine_opt='stack'. The same 8-row file is passed twice so
# that both inputs have the same size.
with h5tbx.File() as h5:
    h5.create_datasets_from_csv(csv_filenames=[csv_filename2, csv_filename2],
                                combine_opt='stack')
    h5.dump()
-
-
(2, 8) [float64]
-
(2, 8) [float64]
Creating datasets from image file(s)#
A dataset can be created from image data. The data can be provided as a list of numpy arrays:
# Pass the image data as a list of five (20, 10) arrays. The list repeats
# one and the same random array five times.
with h5tbx.File() as h5:
    h5.create_dataset_from_image([np.random.random((20, 10))] * 5,
                                 'testimg', axis=0)
    h5.dump()
-
-
(5, 20, 10) [float32]
… or as an iterable object which provides the image data one at a time:
class ImgReader:
    """Dummy Image Reader

    Iterator that hands out a fixed number of random (20, 10) arrays, one
    per ``next()`` call. The instance is its own iterator, so it can be
    consumed only once: after the last image has been returned, every
    further ``next()`` call raises ``StopIteration``.
    """

    def __init__(self, imgdir):
        """Store ``imgdir`` and initialise the iteration state.

        ``imgdir`` is only stored; this dummy reader never reads from it.
        """
        self._imgdir = imgdir
        self._index = 0  # number of images handed out so far
        self._size = 5  # total number of images this reader provides

    def read_img(self):
        """Return one image as a (20, 10) array of random floats."""
        # provide random image. Use case would read from file...
        return np.random.random((20, 10))

    def __iter__(self):
        """Return the reader itself; iteration state lives on the instance."""
        return self

    def __len__(self):
        """Return the total number of images the reader provides."""
        return self._size

    def __next__(self):
        """Return the next image or raise ``StopIteration`` when exhausted."""
        if self._index < self._size:
            self._index += 1
            return self.read_img()
        raise StopIteration
# Pass an ImgReader instance instead of a list of arrays; the path
# 'testdir' is only a placeholder argument.
imgreader = ImgReader('testdir')
with h5tbx.File() as h5:
    h5.create_dataset_from_image(imgreader, 'testimg', axis=0)
    h5.dump()
-
-
(5, 20, 10) [float32]