Prov#

About#

This is the start of a discussion of issues around provenance (prov) tracking in OIH. It may take two paths: one is the prov tracking that indexers might do, and the other is the prov that providers would encode to supply the specific provenance the community requests.

Gleaner Prov#

The Gleaner application generates a prov graph of the activity of accessing and indexing provider resources. The main goal of this prov is to connect an indexed URL to the digital object stored in the object store. This digital object should be the JSON-LD data graph presented by the provider.

By contrast, the authoritative reference in the various profiles will connect the data graph ID (or, in its absence, the data graph URL or the URL of the resource referenced by Gleaner) to another reference. This may be an organization ID or a PID of the connected resource.

 1{
 2    "@context": {
 3        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 4        "prov": "http://www.w3.org/ns/prov#",
 5        "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
 6    },
 7    "@graph": [
 8        {
 9            "@id": "https://www.re3data.org/repository/obis",
10            "@type": "prov:Organization",
11            "rdf:name": "Ocean Biodiversity Information System",
12            "rdfs:seeAlso": "https://obis.org"
13        },
14        {
15            "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
16            "@type": "prov:Entity",
17            "prov:wasAttributedTo": {
18                "@id": "https://www.re3data.org/repository/obis"
19            },
20            "prov:value": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2"
21        },
22        {
23            "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55",
24            "@type": "prov:Collection",
25            "prov:hadMember": {
26                "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2"
27            }
28        },
29        {
30            "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55",
31            "@type": "prov:Entity",
32            "prov:value": "7c1eaa1aaed95861330109026c42e57a31ecae55.jsonld"
33        },
34        {
35            "@id": "https://gleaner.io/id/run/7c1eaa1aaed95861330109026c42e57a31ecae55",
36            "@type": "prov:Activity",
37            "prov:endedAtTime": {
38                "@value": "2021-04-20",
39                "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
40            },
41            "prov:generated": {
42                "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55"
43            },
44            "prov:used": {
45                "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55"
46            }
47        }
48    ]
49}
Hide code cell source
import json
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
from pyld import jsonld
import graphviz
import os, sys

currentdir = os.path.dirname(os.path.abspath(''))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from lib import jbutils

# Load the Gleaner prov example graph (JSON-LD) from the odis-in checkout.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(
    "../../../odis-in/dataGraphs/indexing/prov/graphs/gleaner.json",
    encoding="utf-8",
) as dgraph:
    doc = json.load(dgraph)

# Context shared by the compaction step and the frame: schema.org as the
# default vocabulary plus the PROV-O prefix.
context = {
    "@vocab": "https://schema.org/",
    "prov": "http://www.w3.org/ns/prov#",
}

# Frame that roots the output at the prov:Activity node and embeds the
# generated entity, the end time, and the used collection.
frame = {
    # Copy so the frame never aliases the dict handed to jsonld.compact().
    "@context": dict(context),
    # Must be a real boolean: the string "false" is not a JSON-LD boolean and
    # a non-empty string may be treated as truthy, enabling explicit inclusion.
    "@explicit": False,
    "@type": "prov:Activity",
    "prov:generated": {},
    "prov:endedAtTime": {},
    "prov:used": {},
}

# Compact first so property names use the short "prov:" form, then frame.
compacted = jsonld.compact(doc, context)
framed = jsonld.frame(compacted, frame)

# Pretty-print the framed document for display in the page.
jd = json.dumps(framed, indent=4)
print(jd)
{
    "@context": {
        "@vocab": "https://schema.org/",
        "prov": "http://www.w3.org/ns/prov#"
    },
    "@id": "https://gleaner.io/id/run/7c1eaa1aaed95861330109026c42e57a31ecae55",
    "@type": "prov:Activity",
    "prov:endedAtTime": {
        "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
        "@value": "2021-04-20"
    },
    "prov:generated": {
        "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55",
        "@type": "prov:Entity",
        "prov:value": "7c1eaa1aaed95861330109026c42e57a31ecae55.jsonld"
    },
    "prov:used": {
        "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55",
        "@type": "prov:Collection",
        "prov:hadMember": {
            "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
            "@type": "prov:Entity",
            "prov:value": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
            "prov:wasAttributedTo": {
                "@id": "https://www.re3data.org/repository/obis",
                "@type": "prov:Organization",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#name": "Ocean Biodiversity Information System",
                "http://www.w3.org/2000/01/rdf-schema#seeAlso": "https://obis.org"
            }
        }
    }
}

Nano Prov#

This is a basic nanoprov example. Note that this is a draft: the ID connections and examples have not been filled in yet.

 1{
 2    "@context": {
 3        "gleaner": "https://voc.gleaner.io/id/",
 4        "np": "http://www.nanopub.org/nschema#",
 5        "prov": "http://www.w3.org/ns/prov#",
 6        "xsd": "http://www.w3.org/2001/XMLSchema#"
 7    },
 8    "@set": [
 9        {
10            "@id": "gleaner:nanopub/XID",
11            "@type": "np:NanoPublication",
12            "np:hasAssertion": {
13                "@id": "gleaner:nanopub/XID#assertion"
14            },
15            "np:hasProvenance": {
16                "@id": "gleaner:nanopub/XID#provenance"
17            },
18            "np:hasPublicationInfo": {
19                "@id": "gleaner:nanopub/XID#pubInfo"
20            }
21        },
22        {
23            "@id": "gleaner:nanopub/XID#assertion",
24            "@graph": {
25                "@id": "DataSetURI",
26                "@type": "schema:Dataset",
27                "description": "This is where you would put corrections or annotations",
28                "identifier": [
29                    {
30                        "@type": "schema:PropertyValue",
31                        "name": "GraphSHA",
32                        "description": "A SHA256 sha stamp on the harvested data graph from a URL",
33                        "value": "{{SHA256 HASH HERE}}"
34                    },
35                    {
36                        "@type": "schema:PropertyValue",
37                        "name": "ProviderID",
38                        "description": "The id provided with the data graph by the provider",
39                        "value": "{{re3 or URL noted in config}}"
40                    },
41                    {
42                        "@type": "schema:PropertyValue",
43                        "name": "URL",
44                        "description": "The URL harvested by gleaner",
45                        "value": "{{The URL the JSON-LD came from}}"
46                    }
47                ]
48            }
49        },
50        {
51            "@id": "gleaner:nanopub/XID#provenance",
52            "@graph": {
53                "@id": "URIforprovondataset",
54                "prov:wasGeneratedAtTime": {
55                    "@value": "dateDone",
56                    "@type": "xsd:dateTime"
57                },
58                "prov:wasDerivedFrom": {
59                    "@id": "IDHERE"
60                },
61                "prov:wasAttributedTo": {
62                    "@id": "IDHERE"
63                }
64            }
65        },
66        {
67            "@id": "gleaner:nanopub/XID#pubInfo",
68            "@graph": {
69                "@id": "IDHERE",
70                "prov:wasAttributedTo": {
71                    "@id": "gleaner:tool/gleaner"
72                },
73                "prov:generatedAtTime": {
74                    "@value": "2019-10-23T14:38:00Z",
75                    "@type": "xsd:dateTime"
76                }
77            }
78        }
79    ]
80}
Hide code cell source
import json
from pyld import jsonld
import os, sys

currentdir = os.path.dirname(os.path.abspath(''))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from lib import jbutils

# Read the draft nanoprov example graph (JSON-LD) into a Python object.
with open("../../../odis-in/dataGraphs/indexing/prov/graphs/nanoprov.json") as graph_file:
    doc = json.load(graph_file)

# Compact against schema.org as the default vocabulary.
context = {"@vocab": "https://schema.org/"}
compacted = jsonld.compact(doc, context)

# Render the compacted graph with the project's notebook helper.
jbutils.show_graph(compacted)
../../_images/b06ceb92c218d6631622534fcf12fa2fc8c352e36c51d9c644dce71215284382.svg

Refs#

Nanopubs Guidance