Source code for h5rdmtoolbox.ld.shacl

import pathlib
from dataclasses import dataclass
from typing import Union, List

import h5py
import rdflib
from pyshacl import validate as pyshacl_validate

from .hdf.file import get_ld as get_hdf_ld
from .user.file import get_ld as get_contextual_ld


@dataclass
class ValidationResult:
    conforms: bool
    results_graph: rdflib.Graph
    results_text: str
    messages: List[str]
    nodes: List[str]


def _parse_shacl(shacl: Union[str, pathlib.Path, rdflib.Graph], format) -> rdflib.Graph:
    if isinstance(shacl, pathlib.Path):
        if format is None:
            format = rdflib.util.guess_format(str(shacl))
        shacl_graph = rdflib.Graph()
        shacl_graph.parse(str(shacl), format=rdflib.util.guess_format(str(shacl)))
    elif isinstance(shacl, str):
        if format is None:
            format = "turtle"
        try:
            shacl_graph = rdflib.Graph()
            shacl_graph.parse(shacl, format=format)
        except Exception:
            # it may be a file, not a string graph:
            return _parse_shacl(pathlib.Path(shacl))
    elif isinstance(shacl, rdflib.Graph):
        shacl_graph = shacl
    else:
        raise TypeError("shacl must be a pathlib.Path, str, or rdflib.Graph")
    return shacl_graph


[docs] def validate_hdf( *, hdf_data: Union[str, rdflib.Graph] = None, hdf_source: Union[h5py.File, pathlib.Path] = None, shacl_data: Union[str, rdflib.Graph] = None, shacl_source: Union[str, pathlib.Path] = None, hdf_file_uri="https://example.org/hdf5file#", shacl_format: str = "turtle", hdf_data_format: str = "turtle", **pyshacl_kwargs, ) -> ValidationResult: """Validate an HDF5 file against SHACL shapes. Parameters ---------- hdf_data : Union[str, rdflib.Graph], optional The HDF5 data as a string or rdflib.Graph. If is string is provided it is assumed to be in Turtle format (you may overwrite this by passing hdf_data_format). hdf_source : Union[h5py.File, pathlib.Path], optional The path to the HDF5 file. shacl_data : Union[str, rdflib.Graph], optional The SHACL shapes as a string or rdflib.Graph. If is string is provided it is assumed to be in Turtle format. shacl_source : Union[str, pathlib.Path], optional The path to the SHACL shapes file. shacl_format : str, optional The format of the SHACL shapes string. Default is 'turtle'. hdf_data_format : str, optional The format of the HDF5 data string. Default is 'turtle'. **pyshacl_kwargs Additional keyword arguments to pass to pyshacl.validate(). Returns ------- ValidationResult The result of the validation containing: - conforms: bool - results_graph: rdflib.Graph - results_text: str - messages: List[str] """ if shacl_data is not None and shacl_source is not None: raise ValueError( 'Only one of "shacl_data" or "shacl_source" should be provided.' ) if shacl_data is None and shacl_source is None: raise ValueError('One of "shacl_data" or shacl_source must be provided.') if hdf_data is not None and hdf_source is not None: raise ValueError('Only one of "hdf_data" or "hdf_source" should be provided.') if hdf_data is None and hdf_source is None: raise ValueError('One of "hdf_data" or "hdf_source" must be provided.') if hdf_data is not None: if isinstance(hdf_data, str): h5_graph = rdflib.Graph() h5_graph.parse(hdf_data, format="turtle") elif isinstance(hdf_data, rdflib.Graph): h5_graph = hdf_data else: raise TypeError( f'Parameter "hdf_data" must be a str or rdflib.Graph, but is {type(hdf_data)}' ) if hdf_source is not None: if isinstance(hdf_source, (str, pathlib.Path)): if not pathlib.Path(hdf_source).exists(): raise FileNotFoundError(f'HDF5 file source "{hdf_source}" not found.') with h5py.File(hdf_source, "r") as h5f: return validate_hdf( hdf_data=None, hdf_source=h5f, shacl_data=shacl_data, shacl_source=shacl_source, hdf_file_uri=hdf_file_uri, hdf_data_format=hdf_data_format, **pyshacl_kwargs, ) if not isinstance(hdf_source, h5py.File): raise TypeError( 'Parameter "hdf_source" must be an h5py.File or a path to an HDF5 file.' ) h5_graph1 = get_hdf_ld(hdf_source, file_uri=hdf_file_uri, skipND=True) h5_graph2 = get_contextual_ld(hdf_source, file_uri=hdf_file_uri) h5_graph = h5_graph1 + h5_graph2 shacl_graph = None if shacl_data is not None: if isinstance(shacl_data, str): shacl_graph = rdflib.Graph() shacl_graph.parse(data=shacl_data, format=shacl_format) elif isinstance(shacl_data, rdflib.Graph): shacl_graph = shacl_data else: raise TypeError('Parameter "shacl_data" must be a str or rdflib.Graph') elif shacl_source is not None: # shacl is a filename: if not pathlib.Path(shacl_source).exists(): raise FileNotFoundError(f'SHACL file source "{shacl_source}" not found.') shacl_graph = rdflib.Graph() shacl_graph.parse(source=shacl_source, format=shacl_format) conforms, results_graph, results_text = _validate_graphs( h5_graph, shacl_graph, **pyshacl_kwargs ) return ValidationResult( conforms=conforms, results_graph=results_graph, results_text=results_text, messages=_get_messages(results_graph), nodes=_get_focus_nodes(results_graph), )
def _validate_graphs( data_graph, shacl_graph, inference="rdfs", abort_on_first=False, meta_shacl=False, advanced=False, debug=False, ): """ Validate a data graph against a SHACL shapes graph. Parameters: - data_graph: The RDF graph containing the data to be validated. - shacl_graph: The RDF graph containing the SHACL shapes. - inference: Type of inference to apply ('rdfs', 'owl', or None). - abort_on_first: If True, stop validation on the first error found. - meta_shacl: If True, enable Meta-SHACL features. - advanced: If True, enable advanced SHACL features. - debug: If True, enable debug output. Returns: - A tuple (conforms, results_graph, results_text) where: - conforms: Boolean indicating if the data graph conforms to the shapes. - results_graph: An RDF graph with validation results. - results_text: A textual summary of the validation results. """ conforms, results_graph, results_text = pyshacl_validate( data_graph, shacl_graph=shacl_graph, inference=inference, abort_on_first=abort_on_first, meta_shacl=meta_shacl, advanced=advanced, debug=debug, ) return conforms, results_graph, results_text def _get_messages(results_graph): """ Extract validation messages from a SHACL results graph. Parameters: - results_graph: An RDF graph containing SHACL validation results. Returns: - A list of validation messages. """ messages = [] for s, p, o in results_graph.triples( (None, rdflib.namespace.SH.resultMessage, None) ): messages.append(str(o)) return messages def _get_focus_nodes(results_graph): """ Extract focus nodes from a SHACL results graph. Parameters: - results_graph: An RDF graph containing SHACL validation results. Returns: - A list of focus nodes. """ focus_nodes = [] for s, p, o in results_graph.triples((None, rdflib.namespace.SH.focusNode, None)): focus_nodes.append(o) return focus_nodes