HDF5 and ontologies#
HDF5 itself is considered self-describing due to the ability to store metadata (attributes) with raw data. However, this is only a prerequisite. In particular, achieving (easy) re-usability requires standardized metadata that is publicly defined and accessible. This means that data must be describable with persistent identifiers, as known from linked data solutions.
One way of describing data is to use controlled vocabularies or, even better, ontologies. In fact, an ontology exists that allows describing the structural content of an HDF5 file (groups, datasets, attributes, properties etc.). The h5rdmtoolbox implements a conversion function that translates an HDF5 file into a JSON-LD file. This is outlined here.
import h5rdmtoolbox as h5tbx
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import h5rdmtoolbox as h5tbx
File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/__init__.py:129
125 with File(src) as h5:
126 return h5.dumps()
--> 129 from h5rdmtoolbox.wrapper.ld.hdf.file import get_ld as hdf_get_ld
130 from h5rdmtoolbox.wrapper.ld.user.file import get_ld as user_get_ld
133 def get_ld(
134 hdf_filename: Union[str, pathlib.Path],
135 structural: bool = True,
136 semantic: bool = True,
137 blank_node_iri_base: Optional[str] = None,
138 **kwargs) -> rdflib.Graph:
File ~/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v1.7.0/h5rdmtoolbox/wrapper/ld/__init__.py:1
----> 1 import ssnolib.ssno.standard_name
2 from ontolutils.namespacelib import M4I
3 from ontolutils.namespacelib import SCHEMA
ModuleNotFoundError: No module named 'ssnolib'
Let’s create a sample HDF5 file first:
with h5tbx.File(mode='w') as h5:
h5.create_dataset('test_dataset', shape=(3, ))
grp = h5.create_group('grp')
sub_grp = grp.create_group('Fan')
sub_grp.create_dataset('D3', data=300)
sub_grp['D3'].attrs['units', 'http://w3id.org/nfdi4ing/metadata4ing#hasUnits'] = 'mm'
sub_grp['D3'].rdf['units'].object = 'https://qudt.org/vocab/unit/MilliM'
sub_grp['D3'].attrs['standard_name', 'https://matthiasprobst.github.io/ssno/#standard_name'] = 'blade_diameter3'
h5.dump(False)
-
-
(3) [float32]
-
-
-
300 [mm] (int32)
- standard_name
https://matthiasprobst.github.io/ssno/#standard_name : blade_diameter3 - units
http://w3id.org/nfdi4ing/metadata4ing#hasUnits : mm
https://qudt.org/vocab/unit/MilliM
- standard_name
-
-
Dump the semantic metadata to JSON-LD format#
The semantic metadata is stored in the RDF dictionaries of the HDF5 file, which the h5rdmtoolbox can work with. Call h5tbx.jsonld.dumps() to extract it:
print(h5tbx.jsonld.dumps(h5.hdf_filename,
indent=2,
context={'schema': 'http://schema.org/',
"ssno": "https://matthiasprobst.github.io/ssno/#",
"m4i": "http://w3id.org/nfdi4ing/metadata4ing#"}))
{
"@context": {
"hdf5": "http://purl.allotrope.org/ontologies/hdf5/1.8#",
"m4i": "http://w3id.org/nfdi4ing/metadata4ing#",
"schema": "http://schema.org/",
"ssno": "https://matthiasprobst.github.io/ssno/#"
},
"@graph": [
{
"@id": "_:N1",
"@type": "hdf5:File",
"hdf5:rootGroup": {
"@id": "_:N0",
"@type": "hdf5:Group",
"hdf5:member": [
{
"@id": "_:N2",
"@type": "hdf5:Group",
"hdf5:member": {
"@id": "_:N3",
"@type": "hdf5:Group",
"hdf5:member": {
"@id": "_:N4",
"@type": "hdf5:Dataset",
"hdf5:attribute": [
{
"@id": "_:N5",
"@type": "hdf5:Attribute",
"hdf5:name": "standard_name",
"hdf5:value": "blade_diameter3"
},
{
"@id": "_:N6",
"@type": "hdf5:Attribute",
"hdf5:name": "units",
"hdf5:value": "mm"
}
],
"hdf5:datatype": "H5T_INTEGER",
"hdf5:dimension": 0,
"hdf5:name": "/grp/Fan/D3",
"hdf5:size": 1,
"hdf5:value": {
"@id": "https://qudt.org/vocab/unit/MilliM"
},
"m4i:hasUnits": {
"@id": "https://qudt.org/vocab/unit/MilliM"
},
"ssno:standard_name": "blade_diameter3"
},
"hdf5:name": "/grp/Fan"
},
"hdf5:name": "/grp"
},
{
"@id": "_:N7",
"@type": "hdf5:Dataset",
"hdf5:datatype": "H5T_FLOAT",
"hdf5:dimension": 1,
"hdf5:name": "/test_dataset",
"hdf5:size": 3
}
],
"hdf5:name": "/"
}
}
]
}
Dump the structural metadata to JSON-LD format#
The structural (or organizational) metadata describes the internal layout of the HDF5 file, i.e. its groups, datasets and attributes, their properties, and the relations between them:
hdf_jsonld = h5tbx.dump_jsonld(h5.hdf_filename, skipND=None)
print(hdf_jsonld)
{"@context": {"hdf5": "http://purl.allotrope.org/ontologies/hdf5/1.8#", "units": "http://w3id.org/nfdi4ing/metadata4ing#hasUnits"}, "@graph": [{"@id": "_:N9", "@type": "hdf5:File", "hdf5:rootGroup": {"@id": "_:N8", "@type": "hdf5:Group", "hdf5:member": [{"@id": "_:N10", "@type": "hdf5:Group", "hdf5:member": {"@id": "_:N11", "@type": "hdf5:Group", "hdf5:member": {"@id": "_:N12", "@type": "hdf5:Dataset", "hdf5:attribute": [{"@id": "_:N13", "@type": "hdf5:Attribute", "hdf5:name": "standard_name", "hdf5:value": "blade_diameter3"}, {"@id": "_:N14", "@type": "hdf5:Attribute", "hdf5:name": "units", "hdf5:value": "mm"}], "hdf5:datatype": "H5T_INTEGER", "hdf5:dimension": 0, "hdf5:name": "/grp/Fan/D3", "hdf5:size": 1, "hdf5:value": {"@id": "https://qudt.org/vocab/unit/MilliM"}, "https://matthiasprobst.github.io/ssno/#standard_name": "blade_diameter3", "units": {"@id": "https://qudt.org/vocab/unit/MilliM"}}, "hdf5:name": "/grp/Fan"}, "hdf5:name": "/grp"}, {"@id": "_:N15", "@type": "hdf5:Dataset", "hdf5:datatype": "H5T_FLOAT", "hdf5:dimension": 1, "hdf5:name": "/test_dataset", "hdf5:size": 3}], "hdf5:name": "/"}}]}
Query the HDF-JSONLD file#
The obtained JSON-LD file can be used to search for specific information. In the example below, all datasets are extracted together with their sizes:
sparql_query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX hdf5: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
SELECT ?ds_name ?ds_size
WHERE {
?group rdf:type hdf5:Dataset .
?group hdf5:name ?ds_name .
?group hdf5:size ?ds_size .
}
"""
import rdflib
g = rdflib.Graph()
g.parse(data=hdf_jsonld, format='json-ld')
results = g.query(sparql_query)
for b in results.bindings:
print(b)
{rdflib.term.Variable('ds_name'): rdflib.term.Literal('/grp/Fan/D3'), rdflib.term.Variable('ds_size'): rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))}
{rdflib.term.Variable('ds_name'): rdflib.term.Literal('/test_dataset'), rdflib.term.Variable('ds_size'): rdflib.term.Literal('3', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))}
# convert results to dataframe:
import pandas as pd
df = pd.DataFrame(results.bindings)
df
| | ds_name | ds_size |
|---|---|---|
| 0 | /grp/Fan/D3 | 1 |
| 1 | /test_dataset | 3 |