Query HDF metadata with SPARQL
Metadata in the form of JSON-LD can be queried using SPARQL:
import rdflib
from ssnolib import SSNO
import h5rdmtoolbox as h5tbx
from h5rdmtoolbox import jsonld
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 2
1 import rdflib
----> 2 from ssnolib import SSNO
4 import h5rdmtoolbox as h5tbx
5 from h5rdmtoolbox import jsonld
ModuleNotFoundError: No module named 'ssnolib'
Example file:
# Build a small example file: one dataset "u" whose "standard_name"
# attribute gets an RDF predicate and an RDF object (an IRI) attached.
with h5tbx.File() as h5:
    ds = h5.create_dataset(
        'u',
        data=[1, 2, 3, 4],
        attrs=dict(standard_name='coeff', units='m/s'),
    )
    # Predicate and object of the triple describing the attribute.
    ds.rdf.predicate['standard_name'] = SSNO.hasStandardName
    ds.rdf.object['standard_name'] = "https://local.org/standard_names/piv_correlation_coefficient"
    h5.dump()
Extract metadata:
# Serialize the file's metadata as Turtle; the context supplies the
# "ssno" prefix for the serialization.
ttl = h5tbx.serialize(
    h5.hdf_filename,
    format="ttl",
    context=dict(ssno='https://matthiasprobst.github.io/ssno#'),
)
print(ttl)
SPARQL query:
# Select the name and the standard name of every HDF5 dataset that has
# an ssno:hasStandardName statement.
sparql_query_str = """
PREFIX hdf5: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
PREFIX ssno: <https://matthiasprobst.github.io/ssno#>
SELECT ?name ?sn
{
?obj a hdf5:Dataset .
?obj hdf5:name ?name .
?obj ssno:hasStandardName ?sn .
}
"""
# Load the Turtle text produced above into an in-memory graph.
g = rdflib.Graph()
g.parse(data=ttl, format='ttl')
qres = g.query(sparql_query_str)
# Each result row carries the two selected variables, in SELECT order.
for name, sn in qres:
    print(*map(str, (name, sn)))
Find a dataset with a specific standard_name:
def find_dataset_from_standard_name(hdf_filename, sn, limit=1):
    """Find the dataset(s) of an HDF5 file annotated with a standard name.

    The metadata of ``hdf_filename`` is serialized to Turtle, parsed into an
    rdflib graph and queried with SPARQL for every ``hdf:Dataset`` that has
    an ``ssno:hasStandardName`` statement pointing to ``sn``.

    Parameters
    ----------
    hdf_filename
        HDF5 file to search. It is serialized on every call, so the result
        always reflects this file and not a previously serialized one.
    sn : str
        IRI of the standard name, e.g.
        ``'https://local.org/standard_names/piv_correlation_coefficient'``.
    limit : int or None, optional
        ``1`` (default) returns a single name. A value greater than one
        returns at most that many names. ``None`` or a value below one
        returns all names.

    Returns
    -------
    str or None or list of str
        For ``limit == 1`` the name of the first matching dataset, or
        ``None`` if nothing matches. Otherwise a (possibly empty) list of
        dataset names.
    """
    # Let rdflib render the IRI term instead of splicing the raw string into
    # the query: an ill-formed IRI is rejected rather than altering the query.
    sn_term = rdflib.URIRef(sn).n3()
    sparql_query_str = f"""
    PREFIX hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
    PREFIX ssno: <https://matthiasprobst.github.io/ssno#>
    SELECT ?name
    {{
        ?obj a hdf:Dataset .
        ?obj hdf:name ?name .
        ?obj ssno:hasStandardName {sn_term} .
    }}
    """
    # Query the metadata of the file that was passed in, not a module-level
    # Turtle string that may belong to a different file.
    ttl_data = h5tbx.serialize(
        hdf_filename,
        format="ttl",
        context={'ssno': 'https://matthiasprobst.github.io/ssno#'},
    )
    g = rdflib.Graph().parse(data=ttl_data, format='ttl')
    names = [str(row[0]) for row in g.query(sparql_query_str)]

    if limit == 1:
        return names[0] if names else None
    if limit is not None and limit > 1:
        return names[:limit]
    return names
# Look up the dataset that carries the standard name assigned above.
find_dataset_from_standard_name(
    hdf_filename=h5.hdf_filename,
    sn='https://local.org/standard_names/piv_correlation_coefficient',
    limit=1,
)
def find_attribute_from_name(hdf_filename, attr_name, limit=1):
    """Read the values of an attribute from the objects that define it.

    The metadata of ``hdf_filename`` is serialized to Turtle and queried with
    SPARQL for every ``hdf:Group`` or ``hdf:Dataset`` that has an attribute
    named ``attr_name``. The attribute values are then read from the HDF5
    file itself.

    Parameters
    ----------
    hdf_filename
        HDF5 file to search. It is serialized on every call, so the result
        always reflects this file and not a previously serialized one.
    attr_name : str
        Name of the attribute to look for.
    limit : int or None, optional
        ``1`` (default) returns the value of the first match only. A value
        greater than one returns at most that many values. ``None`` or a
        value below one returns all values.

    Returns
    -------
    dict or None
        For ``limit == 1``: ``{attr_name: value}`` of the first matching
        object, or ``None`` if nothing matches. Otherwise
        ``{attr_name: [value, ...]}`` with one entry per matching object
        (an empty list if nothing matches).
    """
    # Let rdflib render the string literal so that quotes or backslashes in
    # the attribute name are escaped and cannot break or alter the query.
    name_term = rdflib.Literal(attr_name).n3()
    sparql_query_str = f"""
    PREFIX hdf: <http://purl.allotrope.org/ontologies/hdf5/1.8#>
    SELECT ?name
    {{
        ?obj a ?type .
        ?obj hdf:name ?name .
        ?obj hdf:attribute ?attr .
        ?attr hdf:name {name_term} .
        VALUES ?type {{ hdf:Group hdf:Dataset }}
    }}
    """
    # Query the metadata of the file that was passed in, not a module-level
    # Turtle string that may belong to a different file.
    ttl_data = h5tbx.serialize(
        hdf_filename,
        format="ttl",
        context={'ssno': 'https://matthiasprobst.github.io/ssno#'},
    )
    g = rdflib.Graph().parse(data=ttl_data, format='ttl')
    obj_names = [str(row[0]) for row in g.query(sparql_query_str)]

    if limit == 1:
        if not obj_names:
            return None
        with h5tbx.File(hdf_filename, 'r') as h5:
            return {attr_name: h5[obj_names[0]].attrs[attr_name]}

    if limit is not None and limit > 1:
        obj_names = obj_names[:limit]
    if not obj_names:
        return {attr_name: []}
    # Open the file once for all matches instead of once per object.
    with h5tbx.File(hdf_filename, 'r') as h5:
        return {attr_name: [h5[obj_name].attrs[attr_name] for obj_name in obj_names]}
# Fetch the value of the "standard_name" attribute from the first object that has it.
find_attribute_from_name(
    hdf_filename=h5.hdf_filename,
    attr_name='standard_name',
    limit=1,
)