Query HDF metadata with SPARQL

Query HDF metadata with SPARQL

Metadata serialized as RDF (for example as JSON-LD or, as shown below, Turtle) can be queried using SPARQL:

import rdflib
from ssnolib import SSNO

import h5rdmtoolbox as h5tbx
from h5rdmtoolbox import jsonld

Example file:

# Create the example HDF5 file. No filename is passed to h5tbx.File();
# presumably a temporary file is created -- confirm in the h5rdmtoolbox docs.
with h5tbx.File() as h5:
    # Dataset "u" with two string attributes ("standard_name" and "units").
    ds = h5.create_dataset('u', data=[1,2,3,4], attrs={'standard_name': 'coeff', 'units': 'm/s'})
    # Attach RDF semantics to the "standard_name" attribute: the predicate is
    # ssno:hasStandardName and the object is the IRI of the standard name.
    ds.rdf.predicate['standard_name'] = SSNO.hasStandardName
    ds.rdf.object['standard_name'] = "https://local.org/standard_names/piv_correlation_coefficient"
    h5.dump()

Extract metadata:

# Serialize the metadata of the example file as Turtle. The context maps the
# "ssno" prefix to its namespace IRI so the output uses "ssno:" names.
ttl = h5tbx.serialize(h5.hdf_filename,
                      format="ttl",
                      context={'ssno': 'https://matthiasprobst.github.io/ssno#'})
print(ttl)
@prefix hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#> .
@prefix ssno: <https://matthiasprobst.github.io/ssno#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

hdf:H5T_INTEL_I64 a hdf:Datatype .

[] a hdf:File ;
    hdf:rootGroup [ a hdf:Group ;
            hdf:member [ a hdf:Dataset ;
                    hdf:attribute [ a hdf:StringAttribute ;
                            hdf:data "coeff" ;
                            hdf:name "standard_name" ],
                        [ a hdf:StringAttribute ;
                            hdf:data "m/s" ;
                            hdf:name "units" ] ;
                    hdf:dataspace [ a hdf:SimpleDataspace ;
                            hdf:dimension [ a hdf:DataspaceDimension ;
                                    hdf:dimensionIndex 0 ;
                                    hdf:size 4 ] ] ;
                    hdf:datatype hdf:H5T_INTEGER,
                        hdf:H5T_INTEL_I64 ;
                    hdf:layout hdf:H5D_CONTIGUOUS ;
                    hdf:maximumSize 4 ;
                    hdf:name "/u" ;
                    hdf:rank 1 ;
                    hdf:size 4 ;
                    ssno:hasStandardName <https://local.org/standard_names/piv_correlation_coefficient> ] ;
            hdf:name "/" ] .
/home/docs/checkouts/readthedocs.org/user_builds/h5rdmtoolbox/checkouts/v2.5.4/h5rdmtoolbox/wrapper/core.py:2496: UserWarning: Not providing a file-uri is not good practice because it will generate blank nodes. Consider providing an URI such as the DOI URL for example.
  warnings.warn(

SPARQL query:

# Select the HDF5 name and the standard-name IRI of every hdf5:Dataset that
# carries an ssno:hasStandardName relation.
sparql_query_str = """
PREFIX hdf5: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
PREFIX ssno: <https://matthiasprobst.github.io/ssno#>

SELECT  ?name ?sn
{
    ?obj a hdf5:Dataset .
    ?obj hdf5:name ?name .
    ?obj ssno:hasStandardName ?sn .
}
"""
# Parse the Turtle string into an rdflib graph and run the query on it.
g = rdflib.Graph().parse(data=ttl, format='ttl')
qres = g.query(sparql_query_str)

# Each result row unpacks into the two selected variables (?name, ?sn).
for name, sn in qres:
    print(str(name), str(sn))
/u https://local.org/standard_names/piv_correlation_coefficient

Find a dataset with a specific standard_name:

def find_dataset_from_standard_name(hdf_filename, sn, limit=1):
    """Find the HDF5 name(s) of datasets annotated with a given standard name.

    The metadata of ``hdf_filename`` is serialized to Turtle, parsed into an
    rdflib graph and queried with SPARQL for every ``hdf:Dataset`` that has
    an ``ssno:hasStandardName`` relation to ``sn``.

    Parameters
    ----------
    hdf_filename
        Path of the HDF5 file whose metadata is searched.
    sn : str
        IRI of the standard name to look for.
    limit : int, optional
        If 1 (default), only the first match is returned. Any other value
        returns all matches as a list.

    Returns
    -------
    str or None or list of str
        For ``limit == 1`` the name of the first matching dataset, or None
        if nothing matches. Otherwise a (possibly empty) list of names.
    """
    # Serialize the metadata of the file that was actually requested instead
    # of relying on a module-level ``ttl`` string, which may belong to a
    # different file or not exist at all.
    ttl = h5tbx.serialize(hdf_filename,
                          format="ttl",
                          context={'ssno': 'https://matthiasprobst.github.io/ssno#'})
    g = rdflib.Graph().parse(data=ttl, format='ttl')

    # Let rdflib render the IRI term (``<...>``) rather than splicing the raw
    # string into the query; rdflib rejects strings that are not valid IRIs.
    sn_term = rdflib.URIRef(sn).n3()
    sparql_query_str = f"""
    PREFIX hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
    PREFIX ssno: <https://matthiasprobst.github.io/ssno#>

    SELECT ?name
    {{
        ?obj a hdf:Dataset .
        ?obj hdf:name ?name .
        ?obj ssno:hasStandardName {sn_term} .
    }}
    """
    names = [str(row[0]) for row in g.query(sparql_query_str)]

    if limit == 1:
        return names[0] if names else None
    return names
# Look up the name of the dataset annotated with this standard-name IRI.
find_dataset_from_standard_name(
    h5.hdf_filename,
    'https://local.org/standard_names/piv_correlation_coefficient',
    limit=1
)
'/u'
def find_attribute_from_name(hdf_filename, attr_name, limit=1):
    """Find the value(s) of an attribute by searching the file's RDF metadata.

    The metadata of ``hdf_filename`` is serialized to Turtle and queried with
    SPARQL for every ``hdf:Group`` or ``hdf:Dataset`` that has an attribute
    named ``attr_name``. The attribute values themselves are then read from
    the HDF5 file.

    Parameters
    ----------
    hdf_filename
        Path of the HDF5 file to search.
    attr_name : str
        Name of the attribute to look for.
    limit : int, optional
        If 1 (default), only the first match is returned. Any other value
        returns the values of all matches.

    Returns
    -------
    dict or None
        For ``limit == 1``: ``{attr_name: value}`` of the first matching
        object, or None if nothing matches. Otherwise
        ``{attr_name: [value, ...]}`` with one entry per matching object
        (an empty list if nothing matches).
    """
    # Serialize the metadata of the file that was actually requested instead
    # of relying on a module-level ``ttl`` string. The call mirrors the one
    # used for the example file earlier on this page.
    ttl = h5tbx.serialize(hdf_filename,
                          format="ttl",
                          context={'ssno': 'https://matthiasprobst.github.io/ssno#'})
    g = rdflib.Graph().parse(data=ttl, format='ttl')

    # Let rdflib render the string literal so that quotes or backslashes in
    # the attribute name are escaped and cannot break the query.
    attr_term = rdflib.Literal(str(attr_name)).n3()
    sparql_query_str = f"""
    PREFIX hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#>

    SELECT  ?name
    {{
        ?obj a ?type .
        ?obj hdf:name ?name .
        ?obj hdf:attribute ?attr .
        ?attr hdf:name {attr_term} .
        VALUES ?type {{  hdf:Group hdf:Dataset }}
    }}
    """
    obj_names = [str(row[0]) for row in g.query(sparql_query_str)]
    first_only = limit == 1

    if not obj_names:
        # Nothing matched: do not open the file at all.
        return None if first_only else {attr_name: []}
    if first_only:
        obj_names = obj_names[:1]

    # Open the file once and read all requested values, rather than
    # reopening it for every matching object.
    with h5tbx.File(hdf_filename, 'r') as h5:
        values = [h5[obj_name].attrs[attr_name] for obj_name in obj_names]

    if first_only:
        return {attr_name: values[0]}
    return {attr_name: values}
# Look up the value of the "standard_name" attribute via the RDF metadata.
find_attribute_from_name(h5.hdf_filename, 'standard_name', limit=1)
{'standard_name': 'coeff'}