HDF5 and ontologies

HDF5 is considered self-describing because it can store metadata (attributes) alongside the raw data. However, this is only a prerequisite. Achieving (easy) re-usability in particular requires standardized metadata that is publicly defined and accessible. This means that data must be describable with persistent identifiers, as known from linked-data solutions.

One way of describing data is to use controlled vocabularies or, even better, ontologies. In fact, an ontology exists that allows describing the structural content of an HDF5 file (groups, datasets, attributes, properties, etc.). The h5rdmtoolbox implements a conversion function that translates an HDF5 file into a JSON-LD file. This is outlined here.

import h5rdmtoolbox as h5tbx

Let’s create a sample HDF5 file first:

with h5tbx.File(mode='w') as h5:
    h5.create_dataset('test_dataset', shape=(3, ))
    grp = h5.create_group('grp')
    sub_grp = grp.create_group('Fan')
    sub_grp.create_dataset('D3', data=300)
    sub_grp['D3'].attrs['units', 'http://w3id.org/nfdi4ing/metadata4ing#hasUnits'] = 'mm'
    sub_grp['D3'].rdf['units'].object = 'https://qudt.org/vocab/unit/MilliM'
    sub_grp['D3'].attrs['standard_name', 'https://matthiasprobst.github.io/ssno/#standard_name'] = 'blade_diameter3'
    h5.dump(False)

Dump the semantic metadata to JSON-LD format

The semantic metadata is stored in the RDF dictionaries of the HDF5 file, which the h5rdmtoolbox can work with. The example below extracts it with h5tbx.serialize(); the printed result uses Turtle syntax:

print(h5tbx.serialize(h5.hdf_filename,
                      indent=2,
                      context={'schema': 'http://schema.org/',
                               "ssno":  "https://matthiasprobst.github.io/ssno/#",
                               "m4i": "http://w3id.org/nfdi4ing/metadata4ing#"}))
@prefix hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#> .
@prefix m4i: <http://w3id.org/nfdi4ing/metadata4ing#> .
@prefix ns1: <https://matthiasprobst.github.io/ssno/#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

hdf:H5T_IEEE_F32LE a hdf:Datatype .

hdf:H5T_INTEL_I64 a hdf:Datatype .

hdf:scalarDataspace a hdf:scalarDataspace .

[] a hdf:File ;
    hdf:rootGroup [ a hdf:Group ;
            hdf:member [ a hdf:Dataset ;
                    hdf:dataspace [ a hdf:SimpleDataspace ;
                            hdf:dimension [ a hdf:DataspaceDimension ;
                                    hdf:dimensionIndex 0 ;
                                    hdf:size 3 ] ] ;
                    hdf:datatype hdf:H5T_IEEE_F32LE,
                        "H5T_FLOAT" ;
                    hdf:layout hdf:H5D_CONTIGUOUS ;
                    hdf:maximumSize 3 ;
                    hdf:name "/test_dataset" ;
                    hdf:rank 1 ;
                    hdf:size 3 ],
                [ a hdf:Group ;
                    hdf:member [ a hdf:Group ;
                            hdf:member [ a hdf:Dataset ;
                                    hdf:attribute [ a hdf:StringAttribute ;
                                            hdf:data "blade_diameter3"^^xsd:string ;
                                            hdf:name "standard_name" ],
                                        [ a hdf:StringAttribute ;
                                            hdf:data "mm"^^xsd:string ;
                                            hdf:name "units" ] ;
                                    hdf:dataspace hdf:scalarDataspace ;
                                    hdf:datatype hdf:H5T_INTEL_I64,
                                        "H5T_INTEGER" ;
                                    hdf:layout hdf:H5D_CONTIGUOUS ;
                                    hdf:maximumSize -1 ;
                                    hdf:name "/grp/Fan/D3" ;
                                    hdf:rank 0 ;
                                    hdf:size 1 ;
                                    hdf:value "300" ;
                                    m4i:hasUnits <https://qudt.org/vocab/unit/MilliM> ;
                                    ns1:standard_name "blade_diameter3"^^xsd:string ] ;
                            hdf:name "/grp/Fan"^^xsd:string ] ;
                    hdf:name "/grp"^^xsd:string ] ;
            hdf:name "/"^^xsd:string ] .

Dump the structural metadata to JSON-LD format

The structural (or organizational) metadata describes the internal layout of the HDF5 file, i.e. its groups, datasets, and attributes, together with their properties and the relations between them:

hdf_jsonld = h5tbx.dump_jsonld(h5.hdf_filename, skipND=None)
print(hdf_jsonld)
{
  "@context": {
    "hdf": "http://purl.allotrope.org/ontologies/hdf5/1.8#",
    "m4i": "http://w3id.org/nfdi4ing/metadata4ing#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  },
  "@graph": [
    {
      "@id": "hdf:scalarDataspace",
      "@type": "hdf:scalarDataspace"
    },
    {
      "@id": "hdf:H5T_IEEE_F32LE",
      "@type": "hdf:Datatype"
    },
    {
      "@id": "hdf:H5T_INTEL_I64",
      "@type": "hdf:Datatype"
    },
    {
      "@id": "_:ce43df509b959153c9a1931fd73f3486",
      "@type": "hdf:File",
      "hdf:rootGroup": {
        "@id": "_:a144b2a7f9b4e2a1f19eba0e919ba1a7"
      }
    },
    {
      "@id": "_:a144b2a7f9b4e2a1f19eba0e919ba1a7",
      "@type": "hdf:Group",
      "hdf:member": [
        {
          "@id": "_:c2872782f44afdfb16f0accbe1421e3d"
        },
        {
          "@id": "_:ca355230ea6666e0b1220e7edd6b29ac"
        }
      ],
      "hdf:name": "/"
    },
    {
      "@id": "_:c2872782f44afdfb16f0accbe1421e3d",
      "@type": "hdf:Dataset",
      "hdf:dataspace": {
        "@id": "_:N1389732235644a5dbe51dc8d45f42530"
      },
      "hdf:datatype": [
        {
          "@id": "hdf:H5T_IEEE_F32LE"
        },
        "H5T_FLOAT"
      ],
      "hdf:layout": {
        "@id": "hdf:H5D_CONTIGUOUS"
      },
      "hdf:maximumSize": 3,
      "hdf:name": "/test_dataset",
      "hdf:rank": 1,
      "hdf:size": 3
    },
    {
      "@id": "_:N1389732235644a5dbe51dc8d45f42530",
      "@type": "hdf:SimpleDataspace",
      "hdf:dimension": {
        "@id": "_:N8bfa1924c1bc4bfab5247cbb8c872815"
      }
    },
    {
      "@id": "_:N8bfa1924c1bc4bfab5247cbb8c872815",
      "@type": "hdf:DataspaceDimension",
      "hdf:dimensionIndex": 0,
      "hdf:size": 3
    },
    {
      "@id": "_:ca355230ea6666e0b1220e7edd6b29ac",
      "@type": "hdf:Group",
      "hdf:member": {
        "@id": "_:a8a14cbbfb6c2687f5d8cf1c6fa77c0c"
      },
      "hdf:name": "/grp"
    },
    {
      "@id": "_:a8a14cbbfb6c2687f5d8cf1c6fa77c0c",
      "@type": "hdf:Group",
      "hdf:member": {
        "@id": "_:f65cd632d277a0731846cac8b742b04d"
      },
      "hdf:name": "/grp/Fan"
    },
    {
      "@id": "_:f65cd632d277a0731846cac8b742b04d",
      "@type": "hdf:Dataset",
      "hdf:attribute": [
        {
          "@id": "_:N3d844330b0234a70a51bde79299d859a"
        },
        {
          "@id": "_:N5427da569afe4445b8f94134ff950eaf"
        }
      ],
      "hdf:dataspace": {
        "@id": "hdf:scalarDataspace"
      },
      "hdf:datatype": [
        {
          "@id": "hdf:H5T_INTEL_I64"
        },
        "H5T_INTEGER"
      ],
      "hdf:layout": {
        "@id": "hdf:H5D_CONTIGUOUS"
      },
      "hdf:maximumSize": -1,
      "hdf:name": "/grp/Fan/D3",
      "hdf:rank": 0,
      "hdf:size": 1,
      "hdf:value": "300",
      "https://matthiasprobst.github.io/ssno/#standard_name": "blade_diameter3",
      "m4i:hasUnits": {
        "@id": "https://qudt.org/vocab/unit/MilliM"
      }
    },
    {
      "@id": "_:N3d844330b0234a70a51bde79299d859a",
      "@type": "hdf:StringAttribute",
      "hdf:data": "mm",
      "hdf:name": "units"
    },
    {
      "@id": "_:N5427da569afe4445b8f94134ff950eaf",
      "@type": "hdf:StringAttribute",
      "hdf:data": "blade_diameter3",
      "hdf:name": "standard_name"
    }
  ]
}

Query the HDF-JSONLD file

The obtained JSON-LD data can be parsed into an RDF graph and queried with SPARQL to search for specific information. In the example below, all datasets are extracted together with their sizes:

sparql_query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX hdf5: <http://purl.allotrope.org/ontologies/hdf5/1.8#>

SELECT ?ds_name ?ds_size
WHERE {
    ?group rdf:type hdf5:Dataset .
    ?group hdf5:name ?ds_name .
    ?group hdf5:size ?ds_size .
}
"""
import rdflib
g = rdflib.Graph()
g.parse(data=hdf_jsonld, format='json-ld')
results = g.query(sparql_query)
for b in results.bindings:
    print(b)
{rdflib.term.Variable('ds_name'): rdflib.term.Literal('/test_dataset'), rdflib.term.Variable('ds_size'): rdflib.term.Literal('3', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))}
{rdflib.term.Variable('ds_name'): rdflib.term.Literal('/grp/Fan/D3'), rdflib.term.Variable('ds_size'): rdflib.term.Literal('1', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer'))}
# convert results to dataframe:
import pandas as pd
df = pd.DataFrame(results.bindings)
df
ds_name ds_size
0 /test_dataset 3
1 /grp/Fan/D3 1