From baa8cbe88fefbdf3918fa651a587fb2dcdf7f231 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 14:20:11 -0500 Subject: [PATCH 01/18] fixing ubuntu dependency error --- .github/workflows/build-qa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-qa.yml b/.github/workflows/build-qa.yml index 6ab5c718..7c008523 100644 --- a/.github/workflows/build-qa.yml +++ b/.github/workflows/build-qa.yml @@ -4,7 +4,7 @@ jobs: build: name: Quality Check - runs-on: ubuntu-latest + runs-on: Ubuntu-20.04 steps: - uses: actions/checkout@v2 with: From 8a7f2120a9900eb71e74c591d3ed53a54f62f55f Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 14:29:39 -0500 Subject: [PATCH 02/18] adding virtual environment --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 83d85db9..9bd61430 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ __pycache__/ pkt_kg.egg* build/* dist/* +venv/* #### Testing .single_run From 1bab2d6b1c243d461d2d52480c6cd6a0244de319 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 15:01:52 -0500 Subject: [PATCH 03/18] addressing mypy error --- pkt_kg/utils/data_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkt_kg/utils/data_utils.py b/pkt_kg/utils/data_utils.py index 82ebcd92..c047b488 100644 --- a/pkt_kg/utils/data_utils.py +++ b/pkt_kg/utils/data_utils.py @@ -86,7 +86,7 @@ def url_download(url: str, write_location: str, filename: str) -> None: def ftp_url_download(url: str, write_location: str, filename: str) -> None: - """Downloads a file from an ftp server. + """Downloads a file from a ftp server. Args: url: A string that points to the location of a temp mapping file that needs to be processed. @@ -108,7 +108,7 @@ def ftp_url_download(url: str, write_location: str, filename: str) -> None: def gzipped_ftp_url_download(url: str, write_location: str, filename: str) -> None: - """Downloads a gzipped file from an ftp server. + """Downloads a gzipped file from a ftp server. Args: url: A string that points to the location of a temp mapping file that needs to be processed. @@ -153,9 +153,9 @@ def zipped_url_download(url: str, write_location: str, filename: str = '') -> No print('Downloading Zipped Data from {}'.format(url)) with requests.get(url, allow_redirects=True) as zip_data: - with ZipFile(BytesIO(zip_data.content)) as zip_file: + with ZipFile(BytesIO(zip_data.content)) as zip_file: # type: ignore zip_file.extractall(write_location[:-1]) - zip_data.close() + zip_data.close() # type: ignore if filename != '': os.rename(write_location + re.sub(zip_pat, '', url.split('/')[-1]), write_location + filename) return None @@ -213,7 +213,7 @@ def chunks(lst: List[str], chunk_size: int) -> Generator: Args: lst: A list of objects, can be strings or integers. - chunk_size: An integer which specifies the how big each chunk should be. + chunk_size: An integer which specifies how big each chunk should be. Returns: A nested list, where the length of each nested list is the size of the integer passed by the user. @@ -304,7 +304,7 @@ def explodes_data(df: pd.DataFrame, lst_cols: list, splitter: str, fill_value: s lst_cols: A list of columns to unnest splitter: A character delimiter used in nested columns fill_value: A string value to fill empty cell values with - preserve_idx: Whether or not thee original index should be preserved or reset. + preserve_idx: Whether the original index should be preserved or reset. 
Returns: An exploded Pandas DataFrame. From b3958525988414159e9ce0fb14070db22e2b20e0 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 15:01:58 -0500 Subject: [PATCH 04/18] updating owlnets --- pkt_kg/owlnets.py | 99 ++++++++++++++++++++++++++----------------- tests/test_owlnets.py | 58 +++++++++++++++++++++---- 2 files changed, 109 insertions(+), 48 deletions(-) diff --git a/pkt_kg/owlnets.py b/pkt_kg/owlnets.py index 379bcaf4..356dea5d 100644 --- a/pkt_kg/owlnets.py +++ b/pkt_kg/owlnets.py @@ -74,7 +74,7 @@ class OwlNets(object): default list ['RO']). Raises: - TypeError: If graph is not an rdflib.graph object. + TypeError: If graph is not a rdflib.graph object. ValueError: If graph is an empty rdflib.graph object. TypeError: If the file containing owl object properties is not a txt file. TypeError: If the file containing owl object properties is empty. @@ -139,7 +139,7 @@ def removes_disjoint_with_axioms(self) -> None: return None def removes_edges_with_owl_semantics(self, verbose: bool = True) -> Graph: - """Creates a filtered knowledge graph, such that only nodes that are owl:Class/owl:Individual connected via a + """Creates a filtered knowledge graph, such that only nodes that are owl:Class/owl:Individual connected via an owl:ObjectProperty and not an owl:AnnotationProperty. For example: REMOVE - edges needed to support owl semantics (not biologically meaningful): subject: obo:CLO_0037294; predicate: owl:AnnotationProperty; object: rdf:about=obo.CLO_0037294 @@ -148,7 +148,7 @@ def removes_edges_with_owl_semantics(self, verbose: bool = True) -> Graph: subject: obo:CHEBI_16130; predicate: obo:RO_0002606; object: obo:HP_0000832 Args: - verbose: A bool indicating whether or not to print/log method use. + verbose: A bool indicating whether to print/log method use. Returns: filtered_graph: An RDFLib graph that contains only clinically and biologically meaningful triples. @@ -192,12 +192,12 @@ def removes_edges_with_owl_semantics(self, verbose: bool = True) -> Graph: return filtered_graph def cleans_decoded_graph(self, verbose: bool = True) -> Graph: - """Creates a filtered knowledge graph, such that only nodes that are owl:Class/owl:Individual connected via a + """Creates a filtered knowledge graph, such that only nodes that are owl:Class/owl:Individual connected via an owl:ObjectProperty and not an owl:AnnotationProperty. This method is a reduced version of the removes_edges_with_owl_semantics method, which is meant to be applied to a graph after it's been decoded. Args: - verbose: A bool indicating whether or not to print/log progress. + verbose: A bool indicating whether to print/log progress. Returns: filtered_graph: An RDFLib graph that contains only clinically and biologically meaningful triples. @@ -303,7 +303,7 @@ def reconciles_classes(self, node: URIRef) -> Set: node: An RDFLib URIRef object. Returns: - matches: A set of tuples, where each tuple contains a triple that is comprised of three RDFLib objects of + matches: A set of tuples, where each tuple contains a triple that consists of three RDFLib objects of type URIRef, BNode, and/or Literal. """ @@ -387,7 +387,7 @@ def captures_cardinality_axioms(self, node_info: Set, node: URIRef) -> None: def detects_negation_axioms(self, node_info: Dict, node: URIRef) -> bool: """Removes axioms from an RDFLib Graph object that convey or contain negation. The method currently checks - for negation by searching for any occurrence of the following key words: "not", "lacks". 
+ for negation by searching for any occurrence of the following keywords: "not", "lacks". Args: node_info: A nested dictionary. The outer dictionary keys are anonymous nodes and the inner keys are @@ -416,8 +416,8 @@ def detects_complement_of_constructed_classes(self, node_info: Dict, node: URIRe node: An RDFLib URIRef object containing node information. Returns: - True if the class is detected to contain a owl:ComplementOf constructor. - False if the class is not detected to contain a owl:ComplementOf constructor. + True if the class is detected to contain an owl:ComplementOf constructor. + False if the class is not detected to contain an owl:ComplementOf constructor. """ comp_res = {k: v for k, v in node_info.items() if 'complementOf' in v.keys()} @@ -429,10 +429,10 @@ def returns_object_property(sub: URIRef, obj: URIRef, prop: URIRef = None) -> UR """Checks the subject and object node types in order to determine the correct type of owl:ObjectProperty. The following ObjectProperties are returned for each of the following subject-object types: - - subject + object are not PATO terms + prop is None --> rdfs:subClassOf - - sub + obj are PATO terms + prop is None --> rdfs:subClassOf - - sub is not a PATO term, but obj is a PATO term --> owl:RO_000086 - - sub is a PATO term + obj is a PATO term + prop is not None --> prop + - if sub + obj are PATO terms + prop is None --> rdfs:subClassOf + - elif sub is not a PATO term, but obj is a PATO term --> obo:RO_000086 + - elif prop is not None --> prop + - else --> rdfs:subClassOf Args: sub: An rdflib.term object. @@ -444,8 +444,8 @@ def returns_object_property(sub: URIRef, obj: URIRef, prop: URIRef = None) -> UR """ if ('PATO' in sub and 'PATO' in obj) and not prop: return RDFS.subClassOf - elif ('PATO' not in sub and 'PATO' not in obj) and not prop: return RDFS.subClassOf - elif 'PATO' not in sub and 'PATO' in obj: return URIRef(obo + 'RO_0000086') + elif 'PATO' not in sub and 'PATO' in obj: return obo.RO_0000086 + elif not prop: return RDFS.subClassOf else: return prop @staticmethod @@ -499,20 +499,7 @@ def parses_constructors(self, node: URIRef, edges: Dict, class_dict: Dict, relat -> Tuple[Set, Optional[Dict]]: """Traverses a dictionary of rdflib objects used in the owl:unionOf or owl:intersectionOf constructors, from which the original set of edges used to the construct the class_node are edited, such that all owl-encoded - information is removed. For example: - INPUT: - - - - - - - - - - - - OUTPUT: [(CL_0000995, rdfs:subClassOf, CL_0001021), (CL_0000995, rdfs:subClassOf, CL_0001026)] + information is removed. See examples here: https://github.com/callahantiff/PheKnowLator/wiki/OWL-NETS-2.0. Args: node: An rdflib term of type URIRef or BNode that references an OWL-encoded class. 
@@ -526,9 +513,9 @@ def parses_constructors(self, node: URIRef, edges: Dict, class_dict: Dict, relat """ cleaned: Set = set() - if 'unionOf' in edges.keys() or 'intersectionOf' in edges.keys(): - batch = class_dict[edges['unionOf' if 'unionOf' in edges.keys() else 'intersectionOf']] - else: batch = edges + if 'unionOf' in edges.keys(): batch = class_dict[edges['unionOf']]; keyword = 'union' + elif 'intersectionOf' in edges.keys(): batch = class_dict[edges['intersectionOf']]; keyword = 'intersection' + else: batch = edges; keyword = 'other' while batch: if ('first' in batch.keys() and 'rest' in batch.keys()) and 'type' not in batch.keys(): @@ -540,7 +527,9 @@ def parses_constructors(self, node: URIRef, edges: Dict, class_dict: Dict, relat else: batch = class_dict[batch['rest']] elif isinstance(batch['first'], URIRef) and isinstance(batch['rest'], URIRef): obj_property = self.returns_object_property(node, batch['first'], relation) - cleaned |= {(node, obj_property, batch['first'])}; batch = None + if keyword == 'union': cleaned |= {(batch['first'], obj_property, node)} + else: cleaned |= {(node, obj_property, batch['first'])} + batch = None else: batch = self.parses_anonymous_axioms(batch, class_dict) else: break @@ -595,13 +584,41 @@ class (referenced by node) in order to remove owl-encoded information. An exampl return cleaned, results[1] else: return cleaned, axioms + @staticmethod + def verifies_cleaned_classes(cleaned_classes: Set) -> Set: + """Verifies a set of cleaned tuples to ensure that there are not duplicate triples (i.e., subject-object + pairs with different properties). The function assumes that a duplicate tuple will include RDFS.subClassOf, + which should be removed. + Args: + cleaned_classes: A set of tuples, where each tuple contains three URIRef objects. + Returns: + A set of tuples, where each tuple contains a cleaned triple comprised of three URIRef objects. + """ + + org = len([x[0::2] for x in list(cleaned_classes)]) + unq = len(set([x[0::2] for x in list(cleaned_classes)])) + + if org == unq: return cleaned_classes + else: + cleaned_dict: Dict = dict(); verified_classes: Set = set() + for s, p, o in cleaned_classes: + key = '{}--{}'.format(str(s), str(o)) + if key in cleaned_dict.keys(): cleaned_dict[key] += [str(p)] + else: cleaned_dict[key] = [str(p)] + for k, v in cleaned_dict.items(): + s = URIRef(k.split('--')[0]); o = URIRef(k.split('--')[1]) + if len(v) > 1 and str(RDFS.subClassOf) in v: p = URIRef([x for x in v if x != str(RDFS.subClassOf)][0]) + else: p = URIRef(v[0]) + verified_classes |= {(s, p, o)} + return verified_classes + def cleans_owl_encoded_entities(self, node_list: List, verbose: bool = True) -> None: - """Loops over a all owl:Class and owl: Axiom objects and decodes the OWL semantics returning the corresponding + """Loops over an all owl:Class and owl: Axiom objects and decodes the OWL semantics returning the corresponding triples for each type without OWL semantics. Args: node_list: A list of owl:Class and owl:Axiom entities to decode. - verbose: A bool indicating whether or not to print/log progress. + verbose: A bool indicating whether to print/log progress. Returns: None. 
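Aside on the new verifies_cleaned_classes method added above: it collapses duplicate subject-object pairs, keeping the more specific predicate whenever rdfs:subClassOf competes with another relation. A standalone sketch of that rule, using the same terms as the unit test added later in this patch (illustration only, not the pkt_kg import):

    from rdflib import Namespace
    from rdflib.namespace import RDFS

    obo = Namespace('http://purl.obolibrary.org/obo/')

    # duplicate subject-object pair: a specific predicate plus rdfs:subClassOf
    cleaned_classes = {(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597),
                       (obo.HP_0000602, RDFS.subClassOf, obo.HP_0000597)}

    pairs = {}  # group predicates by subject-object pair
    for s, p, o in cleaned_classes:
        pairs.setdefault((s, o), []).append(p)
    verified = set()
    for (s, o), preds in pairs.items():
        if len(preds) > 1 and RDFS.subClassOf in preds:
            preds = [p for p in preds if p != RDFS.subClassOf]
        verified.add((s, preds[0], o))
    print(verified)  # keeps only (HP_0000602, BFO_0000051, HP_0000597)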
@@ -620,8 +637,9 @@ def cleans_owl_encoded_entities(self, node_list: List, verbose: bool = True) -> if not neg and not comp: node, org = (node_info[0], node) if isinstance(node, BNode) else (node, node) cleaned_entities |= {org}; cleaned_classes: Set = set() - bnodes = set(x for x in self.graph.objects(org) if isinstance(x, BNode)) - for element in (bnodes if len(bnodes) > 0 else node_info[1].keys()): + # bnodes = set(x for x in self.graph.objects(org) if isinstance(x, BNode)) + # for element in (bnodes if len(bnodes) > 1 else node_info[1].keys()): + for element in node_info[1].keys(): edges = node_info[1][element] while edges: if 'subClassOf' in edges.keys(): @@ -636,11 +654,12 @@ def cleans_owl_encoded_entities(self, node_list: List, verbose: bool = True) -> results = self.parses_restrictions(node, edges, node_info[1]) if results is not None: cleaned_classes |= results[0]; edges = results[1] else: edges = None - else: # catch all other axioms -- only catching owl:onProperty + else: # catch all other axioms -- currently only catching owl:onProperty misc = [x for x in edges.keys() if x not in ['type', 'first', 'rest', 'onProperty']] edges = None; self.owl_nets_dict['misc'][n3(node)] = {tuple(misc)} - decoded_graph = adds_edges_to_graph(decoded_graph, list(cleaned_classes), False) - self.owl_nets_dict['decoded_entities'][n3(node)] = cleaned_classes + verified_classes = self.verifies_cleaned_classes(cleaned_classes) + decoded_graph = adds_edges_to_graph(decoded_graph, list(verified_classes), False) + self.owl_nets_dict['decoded_entities'][n3(node)] = verified_classes self.graph = decoded_graph; self.graph = self.cleans_decoded_graph(verbose) # ; pbar.close() return None diff --git a/tests/test_owlnets.py b/tests/test_owlnets.py index 2025348c..967e3d29 100644 --- a/tests/test_owlnets.py +++ b/tests/test_owlnets.py @@ -47,13 +47,17 @@ def setUp(self): # set-up input arguments self.write_location = self.dir_loc_resources + '/knowledge_graphs' self.kg_filename = '/so_with_imports.owl' + self.kg_filename2 = '/clo_with_imports.owl' # read in knowledge graph self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.graph2 = Graph().parse('http://purl.obolibrary.org/obo/clo.owl', format='xml') # initialize class self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, write_location=self.write_location, filename=self.kg_filename) self.owl_nets2 = OwlNets(kg_construct_approach='instance', graph=self.graph, write_location=self.write_location, filename=self.kg_filename) + self.owl_nets3 = OwlNets(kg_construct_approach='subclass', graph=self.graph2, + write_location=self.write_location, filename=self.kg_filename2) # update class attributes dir_loc_owltools = os.path.join(current_directory, 'utils/owltools') @@ -433,17 +437,17 @@ def test_returns_object_property(self): """Tests the returns_object_property method.""" # when sub and obj are PATO terms and property is none - res1 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.PATO_0000402, None) + res1 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.PATO_0000402) self.assertIsInstance(res1, URIRef) self.assertEqual(res1, RDFS.subClassOf) # when sub and obj are NOT PATO terms and property is none - res2 = self.owl_nets.returns_object_property(obo.SO_0000784, obo.GO_2000380, None) + res2 = self.owl_nets.returns_object_property(obo.SO_0000784, obo.GO_2000380) self.assertIsInstance(res2, URIRef) self.assertEqual(res2, RDFS.subClassOf) # when the obj is a 
PATO term and property is none - res3 = self.owl_nets.returns_object_property(obo.SO_0000784, obo.PATO_0001199, None) + res3 = self.owl_nets.returns_object_property(obo.SO_0000784, obo.PATO_0001199) self.assertIsInstance(res3, URIRef) self.assertEqual(res3, obo.RO_0000086) @@ -458,7 +462,7 @@ def test_returns_object_property(self): self.assertEqual(res5, obo.RO_0002202) # when sub is a PATO term and property is none - res6 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.SO_0000784, None) + res6 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.SO_0000784) self.assertEqual(res6, None) return None @@ -526,7 +530,7 @@ def test_parses_constructors_intersection(self): # set-up inputs node = obo.SO_0000034 node_info = self.owl_nets.creates_edge_dictionary(node) - bnodes = set(x for x in self.owl_nets.graph.objects(node, None) if isinstance(x, BNode)) + bnodes = set(x for x in self.owl_nets.graph.objects(node) if isinstance(x, BNode)) edges = {k: v for k, v in node_info[1].items() if 'intersectionOf' in v.keys() and k in bnodes} edges = node_info[1][list(x for x in bnodes if x in edges.keys())[0]] @@ -539,12 +543,12 @@ def test_parses_constructors_intersection(self): return None def test_parses_constructors_intersection2(self): - """Tests the parses_constructors method for the UnionOf class constructor""" + """Tests the parses_constructors method for the intersectionOf class constructor""" # set-up inputs node = obo.SO_0000078 node_info = self.owl_nets.creates_edge_dictionary(node) - bnodes = set(x for x in self.owl_nets.graph.objects(node, None) if isinstance(x, BNode)) + bnodes = set(x for x in self.owl_nets.graph.objects(node) if isinstance(x, BNode)) edges = {k: v for k, v in node_info[1].items() if 'intersectionOf' in v.keys() and k in bnodes} edges = node_info[1][list(x for x in bnodes if x in edges.keys())[0]] @@ -556,13 +560,33 @@ def test_parses_constructors_intersection2(self): return None + def test_parses_constructors_union(self): + """Tests the parses_constructors method for the unionOf class constructor""" + + # set-up inputs + node = obo.CL_0000995 + node_info = self.owl_nets3.creates_edge_dictionary(node) + bnodes = set(x for x in self.owl_nets3.graph.objects(node) if isinstance(x, BNode)) + edges = {k: v for k, v in node_info[1].items() if 'unionOf' in v.keys() and k in bnodes} + edges = node_info[1][list(x for x in bnodes if x in edges.keys())[0]] + + # test method + res = self.owl_nets3.parses_constructors(node, edges, node_info[1]) + self.assertIsInstance(res, Tuple) + self.assertEqual(sorted(list(res[0])), + [(obo.CL_0001021, RDFS.subClassOf, obo.CL_0000995), + (obo.CL_0001026, RDFS.subClassOf, obo.CL_0000995)]) + self.assertEqual(res[1], None) + + return None + def test_parses_restrictions(self): """Tests the parses_restrictions method.""" # set-up inputs node = obo.SO_0000078 node_info = self.owl_nets.creates_edge_dictionary(node) - bnodes = set(x for x in self.owl_nets.graph.objects(node, None) if isinstance(x, BNode)) + bnodes = set(x for x in self.owl_nets.graph.objects(node) if isinstance(x, BNode)) edges = {k: v for k, v in node_info[1].items() if ('type' in v.keys() and v['type'] == OWL.Restriction) and k in bnodes} edges = node_info[1][list(x for x in bnodes if x in edges.keys())[0]] @@ -576,6 +600,24 @@ def test_parses_restrictions(self): return None + def test_verifies_cleaned_classes(self): + """Tests the verifies_cleaned_classes method""" + + # create input data + cleaned_classes = {(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597), + 
(obo.HP_0000602, RDFS.subClassOf, obo.HP_0000597), + (obo.HP_0007715, RDFS.subClassOf, obo.HP_0000597), + (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)} + cleaned_result = sorted(list({(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597), + (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)})) + + # test method + verified_classes = self.owl_nets3.verifies_cleaned_classes(cleaned_classes) + self.assertIsInstance(verified_classes, Set) + self.assertEqual(sorted(list(verified_classes)), cleaned_result) + + return None + def test_cleans_owl_encoded_entities(self): """Tests the cleans_owl_encoded_entities method""" From 3e520f62e31a0d982a2763acf5495a7388506fb9 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 15:23:08 -0500 Subject: [PATCH 05/18] addressing deprecated networkx gpickle reader/writer --- .../entity_search/Entity_Search.ipynb | 10 +++-- pkt_kg/utils/kg_utils.py | 38 ++++++++++--------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/notebooks/tutorials/entity_search/Entity_Search.ipynb b/notebooks/tutorials/entity_search/Entity_Search.ipynb index a53e81b0..dd2b109d 100644 --- a/notebooks/tutorials/entity_search/Entity_Search.ipynb +++ b/notebooks/tutorials/entity_search/Entity_Search.ipynb @@ -245,7 +245,9 @@ "outputs": [], "source": [ "# load the knowledge graph\n", - "kg = nx.read_gpickle(write_location + data_urls[0].split('/')[-1])\n", + "with open(write_location + data_urls[0].split('/')[-1], 'rb') as f:\n", + " kg = pickle.load(f)\n", + "\n", "undirected_kg = nx.to_undirected(kg)\n", "print('The knowledge graph contains {} nodes and {} edges'.format(len(kg.nodes()), len(kg.edges())))" ] @@ -2131,9 +2133,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -2145,7 +2147,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index 7f5720c5..c2e90b39 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -43,6 +43,7 @@ import networkx as nx # type: ignore import os import os.path +import pickle from collections import Counter # type: ignore from more_itertools import unique_everseen # type: ignore @@ -70,7 +71,7 @@ def gets_ontology_classes(graph: Graph) -> Set: graph: An rdflib Graph object. Returns: - class_list: A list of all of the classes in the graph. + class_list: A list of all the classes in the graph. Raises: ValueError: If the query returns zero nodes with type owl:ObjectProperty. @@ -108,7 +109,7 @@ def gets_deprecated_ontology_classes(graph: Graph) -> Set: graph: An rdflib Graph object. Returns: - class_list: A list of all of the deprecated OWL classes in the graph. + class_list: A list of all the deprecated OWL classes in the graph. """ class_list = {x for x in graph.subjects(OWL.deprecated, Literal('true', datatype=URIRef(schema + 'boolean')))} @@ -123,7 +124,7 @@ def gets_object_properties(graph: Graph) -> Set: graph: An rdflib Graph object. Returns: - object_property_list: A list of all of the object properties in the graph. + object_property_list: A list of all the object properties in the graph. Raises: ValueError: If the query returns zero nodes with type owl:ObjectProperty. 
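Note on the gpickle swap in this patch (the notebook cell above and convert_to_networkx below): networkx deprecated read_gpickle/write_gpickle in 2.6 and removed them in 3.0, recommending the pickle module directly, which reads the same files. A minimal sketch of the load side (the file name is hypothetical):

    import pickle
    import networkx as nx

    # read a graph previously written with nx.write_gpickle
    with open('PheKnowLator_full_NetworkxMultiDiGraph.gpickle', 'rb') as f:
        kg = pickle.load(f)
    undirected_kg = nx.to_undirected(kg)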
@@ -162,7 +163,7 @@ def gets_ontology_class_synonyms(graph: Graph) -> Tuple: def gets_ontology_class_dbxrefs(graph: Graph) -> Tuple: """Queries a knowledge graph and returns a dictionary containing all owl:Class objects and their database - cross references (dbxref). Function also includes exact matches. A tuple of dictionaries: (1) contains dbxref and + cross-references (dbxref). Function also includes exact matches. A tuple of dictionaries: (1) contains dbxref and exact matches (URIs and labels); and (2) contains dbxref/exactmatch uris and a string indicating the type (i.e. dbxref or exact match). @@ -224,7 +225,7 @@ def gets_ontology_statistics(file_location: str, owltools_location: str = './pkt def merges_ontologies(onts: List[str], loc: str, merged: str, owltools: str = os.path.abspath('./pkt_kg/libs/owltools')) -> Graph: - """Using the OWLTools API, each ontology listed in in the ontologies attribute is recursively merged with into a + """Using the OWLTools API, each ontology listed in the ontologies attribute is recursively merged with into a master merged ontology file and saved locally to the provided file path via the merged_ontology attribute. The function assumes that the file is written to the directory specified by the write_location attribute. @@ -256,7 +257,7 @@ def ontology_file_formatter(loc: str, full_kg: str, owltools: str = os.path.absp Args: loc: A string pointing to a local directory for writing data. - full_kg: A string containing the subdirectory and name of the the knowledge graph file. + full_kg: A string containing the subdirectory and name of the knowledge graph file. owltools: A string pointing to the location of the owl tools library. Returns: @@ -286,7 +287,7 @@ def adds_edges_to_graph(graph: Graph, edge_list: Union[List, Set], progress_bar: Args: graph: An RDFLib Graph object. edge_list: A list or set of tuples, where each tuple contains a triple. - progress_bar: A boolean indicating whether or not the progress bar should be used. + progress_bar: A boolean indicating whether the progress bar should be used. Returns: graph: An updated RDFLib graph. @@ -401,7 +402,7 @@ def gets_entity_ancestors(graph: Graph, uris: List[Union[URIRef, str]], rel: Uni def connected_components(graph: Union[Graph, Set]) -> List: """Creates a dictionary where the keys are integers representing a component number and the values are sets containing the nodes for a given component. This method works by first converting the RDFLib graph into a - NetworkX multi-directed graph, which is converted to a undirected graph prior to calculating the connected + NetworkX multi-directed graph, which is converted to an undirected graph prior to calculating the connected components. Args: @@ -440,7 +441,7 @@ def removes_self_loops(graph: Graph) -> List: def derives_graph_statistics(graph: Union[Graph, Set, nx.MultiDiGraph]) -> str: """Derives statistics from an input knowledge graph and prints them to the console. Note that we are not converting each node to a string before deriving our counts. This is purposeful as the number of unique nodes is - altered when you it converted to a string. For example, in the HPO when honoring the RDF type of each node + altered when you converted it to a string. For example, in the HPO when honoring the RDF type of each node there are 406,717 unique nodes versus 406,331 unique nodes when ignoring the RDF type of each node. Args: @@ -523,7 +524,7 @@ def removes_namespace_from_bnodes(graph: Graph, ns: Union[str, Namespace] = pkt_ Args: graph: An RDFLib Graph object. 
ns: A string or RDFLib Namespace object (default='https://github.com/callahantiff/PheKnowLator/pkt/bnode/') - verbose: A bool flag used to indicate whether or not to print method function (default=False). + verbose: A bool flag used to indicate whether to print method function (default=False). Returns: updated_graph: An RDFLib Graph object with bnode namespaces removed. @@ -561,7 +562,7 @@ class identifier. A new edge for each triple, containing an instance of a class Args: graph: An RDFLib Graph object containing pkt-namespacing. const: A string containing the type of construction approach used to build the knowledge graph. - verbose: A bool flag used to indicate whether or not to print method function (default=False). + verbose: A bool flag used to indicate whether to print method function (default=False). Returns: graph: An RDFLib Graph object or set of RDFLib triples updated to remove bnode namespacing. @@ -599,7 +600,7 @@ class identifier. A new edge for each triple, containing an instance of a class def splits_knowledge_graph(graph: Graph, graph_output: bool = False) -> Tuple[Graph, Union[Graph, Set]]: """Method takes an input RDFLib Graph object and splits it into two new graphs where the first graph contains only those triples needed to maintain a base logical subset and the second contains only annotation assertions. - Please note that the code below processes both entities (i.e. owl:Class and owl:ObjectProperties + Please note that the code below processes both entities (i.e. owl:Class and owl:ObjectProperties). Source: https://www.w3.org/TR/owl2-syntax/#Annotation_Assertion @@ -719,7 +720,7 @@ def n3(node: Union[URIRef, BNode, Literal]) -> str: def convert_to_networkx(write_loc: str, filename: str, graph: Union[Graph, Set], stats: bool = False) -> Optional[str]: """Converts an RDFLib.Graph object into a Networkx MultiDiGraph and pickles a copy locally. Each node is provided a - key that is the URI identifier and each edge is given a key which is an md5 hash of the triple and a weight of + key that is the URI identifier and each edge is given a key which is a md5 hash of the triple and a weight of 0.0. An example of the output is shown below. The md5 hash is meant to store a unique key that represents that predicate with respect to the triples it occurs with. @@ -735,9 +736,9 @@ def convert_to_networkx(write_loc: str, filename: str, graph: Union[Graph, Set], Args: write_loc: A string pointing to a local directory for writing data. - filename: A string containing the subdirectory and name of the the knowledge graph file. + filename: A string containing the subdirectory and name of the knowledge graph file. graph: An RDFLib Graph object or set of RDFLib Graph triples. - stats: A bool indicating whether or not to derive network statistics after writing networkx file to disk. + stats: A bool indicating whether to derive network statistics after writing networkx file to disk. Returns: network_stats: A string containing network statistics information. 
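Before the hunk below rewrites the pickling call: the predicate_key mentioned in this docstring is an md5 digest stored on each edge. The exact string hashed is computed upstream of the changed lines and is not shown in this diff, so the following is an illustrative sketch of deriving such a key from a full triple, consistent with the docstring but not guaranteed to match pkt_kg byte-for-byte:

    import hashlib
    from rdflib import URIRef

    s = URIRef('https://www.ncbi.nlm.nih.gov/gene/1')
    p = URIRef('http://purl.obolibrary.org/obo/RO_0000056')
    o = URIRef('http://purl.obolibrary.org/obo/GO_0000022')
    # hashing the concatenated triple is an assumption made for illustration
    pred_key = hashlib.md5('{}{}{}'.format(s, p, o).encode()).hexdigest()
    print(pred_key)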
@@ -751,13 +752,15 @@ def convert_to_networkx(write_loc: str, filename: str, graph: Union[Graph, Set], nx_mdg.add_node(s, key=n3(s)); nx_mdg.add_node(o, key=n3(o)) nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight': 0.0}) print('Pickling MultiDiGraph') - nx.write_gpickle(nx_mdg, write_loc + filename + '_NetworkxMultiDiGraph.gpickle') + with open(write_loc + filename + '_NetworkxMultiDiGraph.gpickle', 'wb') as f: + pickle.dump(nx_mdg, f, pickle.HIGHEST_PROTOCOL) + if stats: print('Generating Network Statistics'); return derives_graph_statistics(nx_mdg) else: return None def appends_to_existing_file(edges: Union[List, Set, Graph], filepath: str, sep: str = ' ') -> None: - """Method adds data to the end of an existing file. Assumes that it is adding data to the end of a n-triples file. + """Method adds data to the end of an existing file. Assumes that it is adding data to the end of an n-triples file. Args: edges: A list or set of tuple, where each tuple is a triple. Or an RDFLib Graph object. @@ -775,4 +778,3 @@ def appends_to_existing_file(edges: Union[List, Set, Graph], filepath: str, sep: out.close() return None - From 0eb5da9ef2115e5ccc6136eff8907d8d3c98cd4e Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 16:04:23 -0500 Subject: [PATCH 06/18] removing pkt namespace from final full builds --- pkt_kg/knowledge_graph.py | 41 ++++++++++++++++++++++++++------------- pkt_kg/utils/kg_utils.py | 4 ++-- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pkt_kg/knowledge_graph.py b/pkt_kg/knowledge_graph.py index cedf8461..8d2333d3 100644 --- a/pkt_kg/knowledge_graph.py +++ b/pkt_kg/knowledge_graph.py @@ -46,13 +46,13 @@ class KGBuilder(object): """Class creates a semantic knowledge graph. The class currently facilitates two construction approaches and three build types. The current construction approaches are Instance-based and Subclass-based. The three build types are - (1) Full (i.e. runs all build steps in the algorithm); (2) Partial (i.e. runs all of the build steps through + (1) Full (i.e. runs all build steps in the algorithm); (2) Partial (i.e. runs all the build steps through adding new edges); and (3) Post-Closure: Runs the remaining build steps over a closed knowledge graph. Attributes: construction: A string indicating the construction approach (i.e. instance or subclass). - node_data: A string ("yes" or "no") indicating whether or not to add node data to the knowledge graph. - inverse_relations: A string ("yes" or "no") indicating whether or not to add inverse edge relations. + node_data: A string ("yes" or "no") indicating whether to add node data to the knowledge graph. + inverse_relations: A string ("yes" or "no") indicating whether to add inverse edge relations. decode_owl: A string containing "yes" or "no" indicating whether owl semantics should be removed. cpus: An integer indicating the number of workers to use. write_location: An optional string passed to specify the primary directory to write to. @@ -60,7 +60,7 @@ class KGBuilder(object): Raises: ValueError: If the formatting of kg_version is incorrect (i.e. not "v.#.#.#"). ValueError: If write_location, edge_data does not contain a valid filepath. - OSError: If the ontologies, edge_data, subclass_dict files don't not exist. + OSError: If the ontologies, edge_data, subclass_dict files don't exist. TypeError: If the edge_data and subclass_dict files contains no data. TypeError: If the relations_data, node_data, ontologies directories do not contain any data. 
TypeError: If construction, inverse_relations, node_data, and decode_owl are not strings. @@ -174,7 +174,7 @@ def construct_knowledge_graph(self) -> None: @abstractmethod def gets_build_type(self) -> str: - """"A string representing the type of knowledge graph build.""" + """A string representing the type of knowledge graph build.""" pass @@ -187,7 +187,7 @@ class EdgeConstructor(object): kg_owl: A string containing a filename. rel_dict: A dictionary keyed by URI containing all relations for constructing an edge set. inverse_dict: A dictionary keyed by URI containing all relations and their inverse relation. - node_data: A string ("yes" or "no") indicating whether or not to add node data to the knowledge graph. + node_data: A string ("yes" or "no") indicating whether to add node data to the knowledge graph. metadata: An instance of the metadata class with bound method needed for created edge metadata. ont_cls: A set of RDFLib URIRef terms representing all classes in the core merged ontologies. obj_props: A set of RDFLib URIRef terms representing all object properties in the core merged ontologies. @@ -245,7 +245,7 @@ def verifies_object_property(self, object_property: URIRef) -> None: return None def checks_classes(self, edge_info) -> bool: - """Determines whether or not an edge is safe to add to the knowledge graph by making sure that any ontology + """Determines whether an edge is safe to add to the knowledge graph by making sure that any ontology class nodes are also present in the current list of classes from the merged ontologies graph. Args: @@ -266,7 +266,7 @@ class nodes are also present in the current list of classes from the merged onto else: return URIRef(finds_node_type(edge_info)['cls1']) in self.ont_classes def checks_relations(self, relation: str, edge_list: Union[List, Set]) -> Optional[str]: - """Determines whether or not an inverse relation should be created and added to the graph and verifies + """Determines whether an inverse relation should be created and added to the graph and verifies that a relation and its inverse (if it exists) are both an existing owl:ObjectProperty in the graph. 
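The construct_knowledge_graph hunks that follow post-process the merged graph by calling removes_namespace_from_bnodes twice, once for the bnode namespace and once for pkt_ns. A toy sketch of the underlying transformation, modeled on the kg_utils.py implementation shown later in this series; the sample node label is made up:

    from rdflib import BNode, Graph, Namespace, URIRef
    from rdflib.namespace import OWL, RDF

    pkt_bnode = Namespace('https://github.com/callahantiff/PheKnowLator/pkt/bnode/')

    g = Graph()
    g.add((URIRef(pkt_bnode + 'N195cc'), RDF.type, OWL.Class))  # namespaced "bnode"

    cleaned = Graph()
    for s, p, o in g:  # strip the namespace and restore a true blank node
        s = BNode(str(s).split('/')[-1]) if str(s).startswith(str(pkt_bnode)) else s
        o = BNode(str(o).split('/')[-1]) if str(o).startswith(str(pkt_bnode)) else o
        cleaned.add((s, p, o))
    # the library routine additionally batches triples into subject-only,
    # object-only, and subject-and-object sets before rebuilding the graph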
@@ -353,7 +353,7 @@ def creates_new_edges(self, edge_type: str) -> Graph: class PartialBuild(KGBuilder): def gets_build_type(self) -> str: - """"A string representing the type of knowledge graph build.""" + """A string representing the type of knowledge graph build.""" return 'Partial Build' @@ -428,8 +428,15 @@ def construct_knowledge_graph(self) -> None: # deduplicate logic and annotation files, merge them, and print final stats deduplicates_file(f + annot); deduplicates_file(f + logic); merges_files(f + annot, f + logic, f + full) - graph = Graph().parse(f + full, format='nt') - s = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s); logger.info(s) + str1 = '\nLoading Full (Logic + Annotation) Graph'; print('\n' + str1); logger.info(str1) + graph = Graph().parse(f + full, format='nt'); str2 = 'Deriving Stats'; print('\n' + str2); logger.info(str2) + + # allow logic only and annotation only subsets to contain pkt-namespaced bnodes, but clean + write full graph + s1 = 'Processing pkt-namespaced BNodes in Full (Logic + Annotation) graph'; logger.info(s1); print('\n' + s1) + graph = removes_namespace_from_bnodes(graph=removes_namespace_from_bnodes(graph), ns=pkt_ns) + graph.serialize(f + full, format='nt'); graph.serialize(f + full.replace('nt', 'owl')) + ontology_file_formatter(f, full.replace('nt', 'owl'), self.owl_tools) + s2 = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s2); logger.info(s2) return None @@ -437,7 +444,7 @@ def construct_knowledge_graph(self) -> None: class PostClosureBuild(KGBuilder): def gets_build_type(self) -> str: - """"A string representing the type of knowledge graph being built.""" + """A string representing the type of knowledge graph being built.""" return 'Post-Closure Build' @@ -525,7 +532,7 @@ def construct_knowledge_graph(self) -> None: class FullBuild(KGBuilder): def gets_build_type(self) -> str: - """"A string representing the type of knowledge graph being built.""" + """A string representing the type of knowledge graph being built.""" return 'Full Build' @@ -622,6 +629,12 @@ def construct_knowledge_graph(self) -> None: deduplicates_file(f + annot); deduplicates_file(f + logic); merges_files(f + annot, f + logic, f + full) str1 = '\nLoading Full (Logic + Annotation) Graph'; print('\n' + str1); logger.info(str1) graph = Graph().parse(f + full, format='nt'); str2 = 'Deriving Stats'; print('\n' + str2); logger.info(str2) - s = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s); logger.info(s) + + # allow logic only and annotation only subsets to contain pkt-namespaced bnodes, but clean + write full graph + s1 = 'Processing pkt-namespaced BNodes in Full (Logic + Annotation) graph'; logger.info(s1); print('\n' + s1) + graph = removes_namespace_from_bnodes(graph=removes_namespace_from_bnodes(graph), ns=pkt_ns) + graph.serialize(f + full, format='nt'); graph.serialize(f + full.replace('nt', 'owl')) + ontology_file_formatter(f, full.replace('nt', 'owl'), self.owl_tools) + s2 = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s2); logger.info(s2) return None diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index c2e90b39..a9e04050 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -523,14 +523,14 @@ def removes_namespace_from_bnodes(graph: Graph, ns: Union[str, Namespace] = pkt_ Args: graph: An RDFLib Graph object. 
- ns: A string or RDFLib Namespace object (default='https://github.com/callahantiff/PheKnowLator/pkt/bnode/') + ns: A string or RDFLib Namespace object (default='https://github.com/callahantiff/PheKnowLator/pkt/bnode/'). verbose: A bool flag used to indicate whether to print method function (default=False). Returns: updated_graph: An RDFLib Graph object with bnode namespaces removed. """ - if verbose: print('Removing Namespace from BNodes'); print('Processing Original Nodes') + if verbose: print('Identifying BNodes with Namespace: {}'.format(str(ns))); print('Identifying BNodes') ns_uri = str(ns) if isinstance(ns, Namespace) else ns all_triples = set(graph) sub_only_bnodes_ns = {(s, p, o) for s, p, o in graph if str(s).startswith(ns_uri) and not str(o).startswith(ns_uri)} From 9b9830190b1f0b26d6df02696b86f6ae68201828 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 16:09:54 -0500 Subject: [PATCH 07/18] correcting typos and cleaning up comments --- pkt_kg/construction_approaches.py | 12 ++++++------ pkt_kg/downloads.py | 14 +++++++------- pkt_kg/edge_list.py | 10 +++++----- pkt_kg/utils/data_utils.py | 14 +++++++------- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pkt_kg/construction_approaches.py b/pkt_kg/construction_approaches.py index ae0d828b..d18664cb 100644 --- a/pkt_kg/construction_approaches.py +++ b/pkt_kg/construction_approaches.py @@ -44,9 +44,9 @@ class KGConstructionApproach(object): write_location: A string pointing to the 'resources' directory. Raises: - TypeError: If graph is not an rdflib.graph object. + TypeError: If graph is not a rdflib.graph object. TypeError: If edge_info and edge_dict are not dictionary objects. - ValueError: If graph, edge_info, edge_dict, or subclass_dict files are empty. + ValueError: If a graph, edge_info, edge_dict, or subclass_dict files are empty. OSError: If there is no subclass_dict file in the resources/construction_approach directory. """ @@ -73,7 +73,7 @@ def __init__(self, write_location: str) -> None: self.subclass_dict = pickle.load(filepath, encoding='bytes') def maps_node_to_class(self, edge_type: str, entity: str) -> Optional[List]: - """Takes an entity and checks whether or not it exists in a dictionary of subclass content, such that keys + """Takes an entity and checks whether it exists in a dictionary of subclass content, such that keys are non-class entity identifiers (e.g. Reactome identifiers) and values are sets of ontology class identifiers mapped to that non-class entity. For example: {'R-HSA-5601843': {'PW_0000001'}, 'R-HSA-77584': {'PW_0000001', 'GO_0008334'}} @@ -113,7 +113,7 @@ def subclass_core_constructor(node1: URIRef, node2: URIRef, relation: URIRef, in Args: node1: A URIRef or BNode object containing a subject node. - node2: A URIRef or BNode object containing a object node. + node2: A URIRef or BNode object containing an object node. relation: A URIRef object containing an owl:ObjectProperty. inv_relation: A string containing an inverse relation identifier (i.e. RO_0002200) or None (i.e. indicating no inverse relation). @@ -158,7 +158,7 @@ def subclass_constructor(self, edge_info: Dict, edge_type: str) -> List: Assumption: All ontology class nodes use the obo namespace. - Note. We explicitly type each node as a owl:Class and each relation/inverse relation as a owl:ObjectProperty. + Note. We explicitly type each node as an owl:Class and each relation/inverse relation as an owl:ObjectProperty. 
This may seem redundant, but it is needed in order to ensure consistency between the data after applying the OWL API to reformat the data. @@ -209,7 +209,7 @@ def instance_core_constructor(node1: URIRef, node2: URIRef, relation: URIRef, in Args: node1: A URIRef or BNode object containing a subject node. - node2: A URIRef or BNode object containing a object node. + node2: A URIRef or BNode object containing an object node. relation: A URIRef object containing an owl:ObjectProperty. inv_relation: A string containing the identifier for an inverse relation (i.e. RO_0002200) or None (i.e. indicator of no inverse relation). diff --git a/pkt_kg/downloads.py b/pkt_kg/downloads.py index 7bc5d27d..0dd88ab9 100644 --- a/pkt_kg/downloads.py +++ b/pkt_kg/downloads.py @@ -98,7 +98,7 @@ def parses_resource_file(self) -> None: Raises: ValueError: If the file does not contain data. - ValueError: If there some of the input URLs were improperly formatted. + ValueError: If ome input URLs were improperly formatted. """ pass @@ -112,7 +112,7 @@ def downloads_data_from_url(self) -> None: 'phenotype': 'resources/ontologies/hp_with_imports.owl'} Raises: - ValueError: If not all of the URLs returned valid data. + ValueError: If not all the URLs returned valid data. """ pass @@ -220,7 +220,7 @@ def generates_source_metadata(self) -> None: @abstractmethod def gets_data_type(self) -> str: - """"A string representing the type of data being processed.""" + """A string representing the type of data being processed.""" pass @@ -228,7 +228,7 @@ def gets_data_type(self) -> str: class OntData(DataSource): def gets_data_type(self) -> str: - """"A string representing the type of data being processed.""" + """A string representing the type of data being processed.""" return 'Ontology Data' @@ -242,7 +242,7 @@ def parses_resource_file(self) -> None: Raises: TypeError: If the file does not contain data. - ValueError: If there some of the input URLs were improperly formatted. + ValueError: If some input URLs were improperly formatted. """ if os.stat(self.data_path).st_size == 0: @@ -257,7 +257,7 @@ def parses_resource_file(self) -> None: def downloads_data_from_url(self, owltools_location: str = os.path.abspath('./pkt_kg/libs/owltools')) -> None: """Takes a string representing a file path/name to a text file as an argument. The function assumes - that each item in the input file list is an URL to an OWL/OBO ontology. + that each item in the input file list is a URL to an OWL/OBO ontology. For each URL, the referenced ontology is downloaded, and used as input to an OWLTools command line argument ( https://github.com/owlcollab/owltools/wiki/Extract-Properties-Command), which facilitates the downloading of @@ -306,7 +306,7 @@ def downloads_data_from_url(self, owltools_location: str = os.path.abspath('./pk class LinkedData(DataSource): def gets_data_type(self) -> str: - """"A string representing the type of data being processed.""" + """A string representing the type of data being processed.""" return 'Edge Data' diff --git a/pkt_kg/edge_list.py b/pkt_kg/edge_list.py index d3b2d973..c68c0268 100755 --- a/pkt_kg/edge_list.py +++ b/pkt_kg/edge_list.py @@ -74,7 +74,7 @@ def gets_source_info(self): @staticmethod def identify_header(file_path: str, delimiter: str, skip_rows: List[int]) -> Optional[int]: """Compares the similarity of the first line of a Pandas DataFrame to the column headers when read in with and - without a header to determine whether or not the data frame should be built with a header or not. 
This + without a header to determine whether the data frame should be built with a header or not. This function was modified from a Stack Overflow post: https://stackoverflow.com/a/40193509 Args: @@ -209,7 +209,7 @@ def data_reducer(cols: str, edge_data: pd.DataFrame) -> pd.DataFrame: edge_data = edge_data[[list(edge_data)[int(cols.split(';')[0])], list(edge_data)[int(cols.split(';')[1])]]] edge_data = edge_data.drop_duplicates(subset=None, keep='first', inplace=False) - # make sure neither column is float + # make sure neither column is a float for x in list(edge_data): if 'float' in str(edge_data[x].dtype): edge_data[x] = edge_data[x].astype(int) @@ -246,7 +246,7 @@ def label_formatter(edge_data: pd.DataFrame, label_criteria: str) -> pd.DataFram def data_merger(self, node: int, mapping_data: str, edge_data: pd.DataFrame) -> List[Union[str, pd.DataFrame]]: """Processes a string that contains instructions for mapping a column in the edge_data Pandas DataFrame. This - function assumes that the mapping data pointed to contains two columns: (1) identifier in edge_data to be + function assumes that the mapping data contains two columns: (1) identifier in edge_data to be mapped and (2) the desired identifier to map to. If one of the columns does not need to be mapped to an identifier then the original node's column is used for the final merge. @@ -288,7 +288,7 @@ def data_merger(self, node: int, mapping_data: str, edge_data: pd.DataFrame) -> def process_mapping_data(self, mapping_data: str, edge_data: pd.DataFrame) -> Tuple[Tuple[Any, Any], ...]: """Merges two mapped Pandas DataFrames into a single DataFrame. After merging the DataFrames, the function - removes all columns except the the mapped columns and removes any duplicate rows. + removes all columns except the mapped columns and removes any duplicate rows. Args: mapping_data: A ';' delimited string containing information on identifier mapping data. Each item @@ -354,7 +354,7 @@ def creates_knowledge_graph_edges(self, x: str) -> None: x: A string containing an edge type (e.g. "gene-gene"). Returns: - source_info: A dictionary that contains all of the master information for each edge type resource. For + source_info: A dictionary that contains all the master information for each edge type resource. For example: {'chemical-complex': {'source_labels': ';;', 'data_type': 'class-entity', 'edge_relation': 'RO_0002436', 'uri': ['https://ex/', 'https://ex/'], 'delimiter': 't', 'column_idx': '0;1', 'identifier_maps': 'None', diff --git a/pkt_kg/utils/data_utils.py b/pkt_kg/utils/data_utils.py index c047b488..977fbd9a 100644 --- a/pkt_kg/utils/data_utils.py +++ b/pkt_kg/utils/data_utils.py @@ -183,7 +183,7 @@ def gzipped_url_download(url: str, write_location: str, filename: str) -> None: def data_downloader(url: str, write_location: str, filename: str = '') -> None: - """Downloads data from a URL and saves the file to the `/resources/processed_data/unprocessed_data' directory. + """Downloads data from a URL and saves the file to the '/resources/processed_data/unprocessed_data' directory. Args: url: A string that points to the location of a temp mapping file that needs to be processed. @@ -300,10 +300,10 @@ def explodes_data(df: pd.DataFrame, lst_cols: list, splitter: str, fill_value: s treats the user-provided column list as a stack and recursively un-nests each column. 
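(The Args list for explodes_data continues below.) As a side note on this docstring: pandas 0.25+ ships a built-in DataFrame.explode, so for the single-column case a roughly equivalent sketch, with a made-up toy frame, looks like:

    import pandas as pd

    # toy frame: the 'xrefs' column packs several values behind a '|' delimiter
    df = pd.DataFrame({'id': ['gene-1', 'gene-2'],
                       'xrefs': ['umls:C1|mesh:D2', 'umls:C3']})
    exploded = df.assign(xrefs=df['xrefs'].str.split('|')).explode('xrefs')
    print(exploded)  # one row per id-xref pair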
Args: - df: A Pandas DataFrame containing nested columns - lst_cols: A list of columns to unnest - splitter: A character delimiter used in nested columns - fill_value: A string value to fill empty cell values with + df: A Pandas DataFrame containing nested columns. + lst_cols: A list of columns to unnest. + splitter: A character delimiter used in nested columns. + fill_value: A string value to fill empty cell values with. preserve_idx: Whether the original index should be preserved or reset. Returns: @@ -450,14 +450,14 @@ def merges_files(filepath1: str, filepath2: str, merged_filepath: str) -> None: def sublist_creator(actors: Union[Dict, List], chunk_size: int) -> List: - """Takes a list of lists and returns sublists, where the sublists are balanced according to their length. + """Takes a list of lists and returns sub lists, where the sub lists are balanced according to their length. SOURCE: https://stackoverflow.com/questions/61648065 Args: actors: A list or a dictionary keyed by edge identifier with the length of each associated edge list stored as the values. - chunk_size: An integer specifying the number of sublists that should be returned. + chunk_size: An integer specifying the number of sub lists that should be returned. Returns: updated_lists: A list of lists, where the inner lists have been balanced by their size. From a4187af6f37a19f2aa7dd76249a5cc3c144830d7 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 16:46:11 -0500 Subject: [PATCH 08/18] fixing mypy errors --- pkt_kg/knowledge_graph.py | 7 ++++--- pkt_kg/utils/kg_utils.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pkt_kg/knowledge_graph.py b/pkt_kg/knowledge_graph.py index 8d2333d3..30a4e937 100644 --- a/pkt_kg/knowledge_graph.py +++ b/pkt_kg/knowledge_graph.py @@ -31,6 +31,7 @@ # set global attributes obo = Namespace('http://purl.obolibrary.org/obo/') +pkt_ns = Namespace('https://github.com/callahantiff/PheKnowLator/pkt/') # logging log_dir, f_log, log_config = 'builds/logs', 'pkt_build_log.log', glob.glob('**/logging.ini', recursive=True) @@ -632,9 +633,9 @@ def construct_knowledge_graph(self) -> None: # allow logic only and annotation only subsets to contain pkt-namespaced bnodes, but clean + write full graph s1 = 'Processing pkt-namespaced BNodes in Full (Logic + Annotation) graph'; logger.info(s1); print('\n' + s1) - graph = removes_namespace_from_bnodes(graph=removes_namespace_from_bnodes(graph), ns=pkt_ns) - graph.serialize(f + full, format='nt'); graph.serialize(f + full.replace('nt', 'owl')) + clean_graph: Graph = removes_namespace_from_bnodes(graph=removes_namespace_from_bnodes(graph), ns=pkt_ns) + clean_graph.serialize(f + full, format='nt'); clean_graph.serialize(f + full.replace('nt', 'owl')) ontology_file_formatter(f, full.replace('nt', 'owl'), self.owl_tools) - s2 = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s2); logger.info(s2) + s2 = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(clean_graph)); print('\n' + s2); logger.info(s2) return None diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index a9e04050..8373c2e0 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -544,7 +544,7 @@ def removes_namespace_from_bnodes(graph: Graph, ns: Union[str, Namespace] = pkt_ both_fixed = {(BNode(str(s).split('/')[-1]), p, BNode(str(o).split('/')[-1])) for s, p, o in sub_and_obj_bnodes_ns} del sub_only_bnodes_ns, obj_only_bnodes_ns, sub_and_obj_bnodes_ns if verbose: print('Finalizing 
Updated Graph') - updated_graph = Graph() + updated_graph: Graph = Graph() for s, p, o in (graph_no_bnodes | sub_fixed | obj_fixed | both_fixed): updated_graph.add((s, p, o)) return updated_graph From ef1f9c2a0b970e913af1f49395b6d037878c69a8 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 17:43:16 -0500 Subject: [PATCH 09/18] handling resource warnings --- pkt_kg/metadata.py | 20 +++++++++++++++++--- pkt_kg/utils/kg_utils.py | 4 ++++ tests/test_kg_utils.py | 4 ++++ tests/test_metadata.py | 13 +++++++++++-- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pkt_kg/metadata.py b/pkt_kg/metadata.py index 0ec8dc98..918942f5 100644 --- a/pkt_kg/metadata.py +++ b/pkt_kg/metadata.py @@ -11,6 +11,7 @@ import pickle import re # import subprocess +import warnings from datetime import datetime from rdflib import Graph, Literal, Namespace, URIRef # type: ignore @@ -23,6 +24,10 @@ # set environmental variables oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#') obo = Namespace('http://purl.obolibrary.org/obo/') + +# silence warning related to closing file +warnings.simplefilter('ignore', ResourceWarning) + # logging log_dir, log, log_config = 'builds/logs', 'pkt_build_log.log', glob.glob('**/logging.ini', recursive=True) try: @@ -78,7 +83,9 @@ def metadata_processor(self) -> None: if self.node_data: log_str = 'Loading and Processing Node Metadata'; print(log_str); logger.info(log_str) - self.node_dict = pickle.load(open(self.node_data[0], 'rb'), encoding="utf8") + # self.node_dict = pickle.load(open(self.node_data[0], 'rb'), encoding="utf8") + with open(self.node_data[0], 'rb') as input_file: + self.node_dict = pickle.load(input_file, encoding="utf8") return None @@ -148,7 +155,10 @@ def extract_metadata(self, graph: Graph) -> None: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type': { 'Label': 'type', 'Description': 'The subject is an instance of a class.', 'Synonym': 'None'}}} - if self.node_data: pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + if self.node_data: + # pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + with open(self.node_data[0], 'wb') as output_file: + pickle.dump(self.node_dict, output_file) return None @@ -252,7 +262,11 @@ def output_metadata(self, node_integer_map: Dict, graph: Union[Set, Graph]) -> N log_str = 'Writing Class Metadata'; print(log_str); logger.info(log_str) # make sure that the metadata dict contains valid entries - self._tidy_metadata(); pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + self._tidy_metadata() + # pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + with open(self.node_data[0], 'wb') as output_file: + pickle.dump(self.node_dict, output_file) + # write metadata in flat-file entities = set([i for j in tqdm(graph) for i in j]); filename = self.full_kg[:-4] + '_NodeLabels.txt' with open(self.write_location + filename, 'w', encoding='utf-8') as out: diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index 8373c2e0..2ab729a7 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -44,6 +44,7 @@ import os import os.path import pickle +import warnings from collections import Counter # type: ignore from more_itertools import unique_everseen # type: ignore @@ -63,6 +64,9 @@ pkt_bnode = Namespace('https://github.com/callahantiff/PheKnowLator/pkt/bnode/') schema = Namespace('http://www.w3.org/2001/XMLSchema#') +# silence warning relating to closing file +warnings.simplefilter('ignore', ResourceWarning) + def gets_ontology_classes(graph: Graph) 
-> Set: """Queries a knowledge graph and returns a list of all owl:Class objects (excluding BNodes) in the graph. diff --git a/tests/test_kg_utils.py b/tests/test_kg_utils.py index 168ad3a6..2315815a 100644 --- a/tests/test_kg_utils.py +++ b/tests/test_kg_utils.py @@ -4,6 +4,7 @@ import os.path import shutil import unittest +import warnings from mock import patch from typing import Dict, List, Set, Tuple @@ -15,6 +16,9 @@ # set global attributes obo = Namespace('http://purl.obolibrary.org/obo/') +# silence warning relating to closing file +warnings.simplefilter('ignore', ResourceWarning) + class TestKGUtils(unittest.TestCase): """Class to test knowledge graph utility methods.""" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8db89c5b..f2ca0dee 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -4,6 +4,7 @@ import os.path import pickle import unittest +import warnings from rdflib import Graph, Namespace from rdflib.namespace import RDF, OWL @@ -12,6 +13,10 @@ from pkt_kg.metadata import * +# silence warning relating to closing file +warnings.simplefilter('ignore', ResourceWarning) + + class TestMetadata(unittest.TestCase): """Class to test the metadata class from the metadata script.""" @@ -224,7 +229,9 @@ def test_output_metadata_graph(self): os.remove(filename + 'SO_Triples_Integer_Identifier_Map.json') # write original data - pickle.dump(original_dict, open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb')) + # pickle.dump(original_dict, open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb')) + with open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb') as output_file: + pickle.dump(original_dict, output_file) return None @@ -268,7 +275,9 @@ def test_output_metadata_graph_set(self): os.remove(filename + 'SO_Triples_Integer_Identifier_Map.json') # write original data - pickle.dump(original_dict, open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb')) + # pickle.dump(original_dict, open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb')) + with open(self.dir_loc + '/node_data/node_metadata_dict.pkl', 'wb') as output_file: + pickle.dump(original_dict, output_file) return None From 6016ca87932c56158a671748624494ca23600bbb Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 17:51:14 -0500 Subject: [PATCH 10/18] updating github actions version --- .github/workflows/build-qa.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-qa.yml b/.github/workflows/build-qa.yml index 7c008523..8ca847ab 100644 --- a/.github/workflows/build-qa.yml +++ b/.github/workflows/build-qa.yml @@ -6,7 +6,7 @@ jobs: name: Quality Check runs-on: Ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: fetch-depth: 0 - name: Setup Python From d9f8a15c047fcd4ac389ee88c93bb5a8fbfc8b25 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 18:56:23 -0500 Subject: [PATCH 11/18] fixing typo --- notebooks/OWLNETS_Example_Application.ipynb | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/notebooks/OWLNETS_Example_Application.ipynb b/notebooks/OWLNETS_Example_Application.ipynb index 9b40251f..70362012 100644 --- a/notebooks/OWLNETS_Example_Application.ipynb +++ b/notebooks/OWLNETS_Example_Application.ipynb @@ -94,9 +94,9 @@ "metadata": {}, "outputs": [], "source": [ - "# # uncomment and run to install any required modules from notebooks/requirements.txt\n", + "# # if running a local version of pkt_kg (i.e., not running 
the PyPI version), uncomment the code below\n", "# import sys\n", - "# !{sys.executable} -m pip install -r requirements.txt" + "# sys.path.append('../')" ] }, { @@ -105,9 +105,9 @@ "metadata": {}, "outputs": [], "source": [ - "# # if running a local version of pkt_kg (i.e., not running the PyPI version), uncomment the code below\n", - "# import sys\n", - "# sys.path.append('../')" + "%%collapse\n", + "# uncomment and run to install any required modules from notebooks/requirements.txt\n", + "# !python -m pip install -r requirements.txt" ] }, { @@ -613,8 +613,8 @@ "metadata": {}, "outputs": [], "source": [ - "str1 = 'Decoded {} owl-encoded classes and axioms. Note the following:\\nPartially processed {} cardinality ' \\\n", - " 'elements\\nRemoved {} owl:disjointWith axioms\\n\\nIgnored:\\n -{} misc classes;\\n -{} classes constructed with ' \\\n", + "str1 = 'Decoded {} owl-encoded classes and axioms. Note the following:\\n -Partially processed {} cardinality ' \\\n", + " 'elements\\n -Removed {} owl:disjointWith axioms\\n\\nIgnored:\\n -{} misc classes;\\n -{} classes constructed with ' \\\n", " 'owl:complementOf;\\n -{} classes containing negation (e.g. pr#lacks_part, cl#has_not_completed)\\n' \\\n", " '\\nFiltering removed {} semantic support triples'\n", "stats_str = str1.format(\n", @@ -648,7 +648,7 @@ "metadata": {}, "outputs": [], "source": [ - "# run line below if you want to ensure resulting graph contains \n", + "# run line below if you want to ensure resulting graph contains only a single connected component\n", "common_ancestor = 'http://purl.obolibrary.org/obo/BFO_0000001'\n", "owlnets.graph = owlnets.makes_graph_connected(owlnets.graph, common_ancestor)" ] @@ -822,9 +822,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -836,7 +836,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.8" } }, "nbformat": 4, From d0c39f2e9fc5bfa57bb5258a96f578bdb25ac382 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 20:10:59 -0500 Subject: [PATCH 12/18] fixed owl-nets error and sped up tests --- pkt_kg/owlnets.py | 41 +++++-- tests/test_owlnets.py | 276 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 268 insertions(+), 49 deletions(-) diff --git a/pkt_kg/owlnets.py b/pkt_kg/owlnets.py index 356dea5d..a69a5ae5 100644 --- a/pkt_kg/owlnets.py +++ b/pkt_kg/owlnets.py @@ -303,7 +303,7 @@ def reconciles_classes(self, node: URIRef) -> Set: node: An RDFLib URIRef object. Returns: - matches: A set of tuples, where each tuple contains a triple that consists of three RDFLib objects of + matches: A set of tuples, where each tuple contains a triple comprised of three RDFLib objects of type URIRef, BNode, and/or Literal. """ @@ -425,15 +425,24 @@ def detects_complement_of_constructed_classes(self, node_info: Dict, node: URIRe else: return False @staticmethod - def returns_object_property(sub: URIRef, obj: URIRef, prop: URIRef = None) -> URIRef: + def returns_object_property(sub: URIRef, obj: URIRef, prop: Optional[URIRef] = None) -> URIRef: """Checks the subject and object node types in order to determine the correct type of owl:ObjectProperty. 
The following ObjectProperties are returned for each of the following subject-object types:
-            - if sub + obj are PATO terms + prop is None --> rdfs:subClassOf
-            - elif sub is not a PATO term, but obj is a PATO term --> obo:RO_000086
-            - elif prop is not None --> prop
-            - else --> rdfs:subClassOf
+            - if sub + obj are PATO terms and property is none --> rdfs:subClassOf
+            - if sub + obj are NOT PATO terms and property is none --> rdfs:subClassOf
+            - obj is a PATO term and property is none --> obo:RO_0000086
+            - obj is a PATO term and property is NOT none --> obo:RO_0000086
+            - sub is a PATO term and property is none --> rdfs:subClassOf
+            - sub is a PATO term and property is NOT none --> prop
+            - otherwise (neither term is a PATO term and property is NOT none) --> prop
+
+
        Args:
            sub: An rdflib.term object.
            obj: An rdflib.term object.
@@ -443,10 +452,18 @@ def returns_object_property(sub: URIRef, obj: URIRef, prop: URIRef = None) -> UR
            An rdflib.term object that represents an owl:ObjectProperty.
        """

-        if ('PATO' in sub and 'PATO' in obj) and not prop: return RDFS.subClassOf
-        elif 'PATO' not in sub and 'PATO' in obj: return obo.RO_0000086
-        elif not prop: return RDFS.subClassOf
-        else: return prop
+        if ('PATO' in sub and 'PATO' in obj) and prop is None: return RDFS.subClassOf
+        elif ('PATO' not in sub and 'PATO' not in obj) and prop is None: return RDFS.subClassOf
+        elif 'PATO' in obj and prop is None: return obo.RO_0000086
+        elif 'PATO' in obj and prop is not None: return obo.RO_0000086
+        elif 'PATO' in sub and prop is None: return RDFS.subClassOf
+        elif 'PATO' in sub and prop is not None: return prop
+        else: return prop

     @staticmethod
     def parses_anonymous_axioms(edges: Dict, class_dict: Dict) -> Dict:
@@ -522,7 +539,8 @@ def parses_constructors(self, node: URIRef, edges: Dict, class_dict: Dict, relat
                 if isinstance(batch['first'], URIRef) and isinstance(batch['rest'], BNode):
                     obj_property = self.returns_object_property(node, batch['first'], relation)
                     if node != batch['first']:
-                        cleaned |= {(node, obj_property, batch['first'])}
+                        if keyword == 'union': cleaned |= {(batch['first'], obj_property, node)}
+                        else: cleaned |= {(node, obj_property, batch['first'])}
                         batch = class_dict[batch['rest']] if 'rest' in batch.keys() else None
                     else: batch = class_dict[batch['rest']]
                 elif isinstance(batch['first'], URIRef) and isinstance(batch['rest'], URIRef):
@@ -589,8 +607,10 @@ def verifies_cleaned_classes(cleaned_classes: Set) -> Set:
        """Verifies a set of cleaned tuples to ensure that there are not duplicate triples (i.e., subject-object
        pairs with different properties). The function assumes that a duplicate tuple will include RDFS.subClassOf,
        which should be removed.
+
        Args:
            cleaned_classes: A set of tuples, where each tuple contains three URIRef objects.
+
        Returns:
            A set of tuples, where each tuple contains a cleaned triple comprised of three URIRef objects. 
""" @@ -686,7 +706,6 @@ def makes_graph_connected(self, graph: Graph, common_ancestor: Union[URIRef, str log_str = 'Obtaining node list'; print(log_str); logger.info(log_str) anc_node, roots = common_ancestor if isinstance(common_ancestor, URIRef) else URIRef(common_ancestor), set() nodes = set([x for x in tqdm(list(graph.subjects()) + list(graph.objects())) if isinstance(x, URIRef)]) - print('Identifying root nodes') for x in tqdm(nodes): ancs = gets_entity_ancestors(graph, [x], RDFS.subClassOf) @@ -698,7 +717,6 @@ def makes_graph_connected(self, graph: Graph, common_ancestor: Union[URIRef, str try: ancs = [mode(ancs)] except StatisticsError: ancs = sample(ancs, 1) if not any(x for x in ancs if x in roots) else [] roots |= {ancs[0]} if len(ancs) > 0 else {x} - log_str = 'Updating graph connectivity'; print(log_str); logger.info(log_str) rel = RDF.type if self.kg_construct_approach == 'instance' else RDFS.subClassOf needed_triples = set((URIRef(x), rel, anc_node) for x in roots if x != anc_node) @@ -720,13 +738,10 @@ def purifies_graph_build(self, graph: Graph) -> Graph: """ log_str = 'Purifying Graph Based on Construction Approach'; logger.info(log_str); print(log_str) - org_rel = RDF.type if self.kg_construct_approach == 'subclass' else RDFS.subClassOf pure_rel = RDFS.subClassOf if org_rel == RDF.type else RDF.type - log_str = 'Determining what triples need purification'; print(log_str); logger.info(log_str) triples = list(graph.triples((None, org_rel, None))) - log_str = 'Processing {} {} triples'.format(len(triples), org_rel); print(log_str); logger.info(log_str) for edge in tqdm(triples): graph.add((edge[0], pure_rel, edge[2])); graph.remove(edge) @@ -803,7 +818,7 @@ def runs_owlnets(self, cpus: int = 1) -> Tuple: except RuntimeError: pass acts = [ray.remote(OwlNets).remote(self.graph, loc, f, cons, ot) for _ in range(cpus)] # type: ignore for i in range(0, cpus): acts[i % cpus].cleans_owl_encoded_entities.remote(entities[i]) # type: ignore - _ = ray.wait([x.gets_owlnets_graph.remote() for x in acts], num_returns=len(acts)) # type: ignore + _ = ray.wait([x.gets_owlnets_graph.remote() for x in acts], num_returns=len(acts)) graph_res = ray.get([x.gets_owlnets_graph.remote() for x in acts]) # type: ignore full_graph = adds_edges_to_graph(full_graph, set(x for y in set(graph_res) for x in y), False) res2 += ray.get([x.gets_owlnets_dict.remote() for x in acts]); del acts # type: ignore diff --git a/tests/test_owlnets.py b/tests/test_owlnets.py index 967e3d29..f618939f 100644 --- a/tests/test_owlnets.py +++ b/tests/test_owlnets.py @@ -48,21 +48,8 @@ def setUp(self): self.write_location = self.dir_loc_resources + '/knowledge_graphs' self.kg_filename = '/so_with_imports.owl' self.kg_filename2 = '/clo_with_imports.owl' - # read in knowledge graph - self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') - self.graph2 = Graph().parse('http://purl.obolibrary.org/obo/clo.owl', format='xml') - # initialize class - self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, - write_location=self.write_location, filename=self.kg_filename) - self.owl_nets2 = OwlNets(kg_construct_approach='instance', graph=self.graph, - write_location=self.write_location, filename=self.kg_filename) - self.owl_nets3 = OwlNets(kg_construct_approach='subclass', graph=self.graph2, - write_location=self.write_location, filename=self.kg_filename2) - # update class attributes - dir_loc_owltools = os.path.join(current_directory, 'utils/owltools') - 
self.owl_nets.owl_tools = os.path.abspath(dir_loc_owltools) - self.owl_nets2.owl_tools = os.path.abspath(dir_loc_owltools) + self.dir_loc_owltools = os.path.join(current_directory, 'utils/owltools') return None @@ -80,6 +67,11 @@ def test_initialization_state(self): def test_initialization_owltools_default(self): """Tests the class initialization state for the owl_tools parameter when no default argument is passed.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, write_location=self.write_location, @@ -92,6 +84,11 @@ def test_initialization_owltools_default(self): def test_initialization_owltools(self): """Tests the class initialization state for the owl_tools parameter when an argument is passed.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, write_location=self.write_location, @@ -105,6 +102,12 @@ def test_initialization_owltools(self): def test_initialization_support(self): """Tests the class initialization state for the support parameter.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # when no list is passed owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, @@ -124,6 +127,12 @@ def test_initialization_support(self): def test_initialization_top_level(self): """Tests the class initialization state for the top_level parameter.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # when no list is passed owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, @@ -143,6 +152,12 @@ def test_initialization_top_level(self): def test_initialization_relations(self): """Tests the class initialization state for the relations parameter.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # when no list is passed owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, @@ -184,6 +199,12 @@ def test_initialization_state_graph(self): def test_graph_input_types(self): 
"""Tests different graph input types.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # when graph is provided owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, @@ -205,6 +226,11 @@ def test_graph_input_types(self): def test_initialization_state_construction_approach(self): """Tests the class initialization state for construction approach type.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) self.assertIsInstance(self.owl_nets.kg_construct_approach, str) self.assertTrue(self.owl_nets.kg_construct_approach == 'subclass') self.assertFalse(self.owl_nets.kg_construct_approach == 'instance') @@ -214,6 +240,11 @@ def test_initialization_state_construction_approach(self): def test_initialization_owl_nets_dict(self): """Tests the class initialization state for owl_nets_dict.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) self.assertIsInstance(self.owl_nets.owl_nets_dict, Dict) self.assertIn('decoded_entities', self.owl_nets.owl_nets_dict.keys()) self.assertIn('cardinality', self.owl_nets.owl_nets_dict.keys()) @@ -228,6 +259,12 @@ def test_initialization_owl_nets_dict(self): def test_removes_disjoint_with_axioms(self): """Tests the removes_disjoint_with_axioms method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # create test data triples = [(BNode('N9f94b'), URIRef('http://www.geneontology.org/formats/oboInOwl#source'), Literal('lexical', datatype=URIRef('http://www.w3.org/2001/XMLSchema#string'))), @@ -246,8 +283,14 @@ def test_removes_disjoint_with_axioms(self): def test_removes_edges_with_owl_semantics(self): """Tests the removes_edges_with_owl_semantics method.""" - filtered_graph = self.owl_nets.removes_edges_with_owl_semantics() + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # run method + filtered_graph = self.owl_nets.removes_edges_with_owl_semantics() self.assertIsInstance(filtered_graph, Graph) self.assertEqual(len(filtered_graph), 2328) @@ -256,6 +299,11 @@ def test_removes_edges_with_owl_semantics(self): def 
test_cleans_decoded_graph(self): """Tests the cleans_decoded_graph method when owl has been decoded.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) self.owl_nets.owl_nets_dict['decoded_classes'] = [1, 2, 3, 4, 5] # run method @@ -268,6 +316,12 @@ def test_cleans_decoded_graph(self): def test_recurses_axioms(self): """Tests the recurses_axioms method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # run method when passing axioms that include BNodes seen_nodes = [] axioms = [(BNode('N194ae548a89740849c3536d9753d39d8'), OWL.someValuesFrom, obo.SO_0000784)] @@ -288,6 +342,12 @@ def test_recurses_axioms(self): def test_finds_uri(self): """Tests the finds_bnode_uri method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up testing data triples = [(BNode('N31fefc6d'), RDF.type, OWL.Axiom), (BNode('N31fefc6d'), OWL.annotatedProperty, RDFS.subClassOf), @@ -308,6 +368,12 @@ def test_finds_uri(self): def test_reconciles_axioms(self): """Tests the reconciles_axioms method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up testing data triples = [(BNode('N31fefc6d'), RDF.type, OWL.Axiom), (BNode('N31fefc6d'), OWL.annotatedProperty, RDFS.subClassOf), @@ -333,6 +399,12 @@ def test_reconciles_axioms(self): def test_reconciles_classes(self): """Tests the reconciles_classes method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up testing data triples = [(obo.UBERON_0002374, RDFS.subClassOf, BNode('N41c7c5fd')), (BNode('N41c7c5fd'), RDF.type, OWL.Restriction), @@ -353,6 +425,11 @@ def test_reconciles_classes(self): def test_creates_edge_dictionary(self): """Tests the creates_edge_dictionary method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = 
os.path.abspath(self.dir_loc_owltools) node, edge_dict, cardinality = self.owl_nets.creates_edge_dictionary(obo.SO_0000822) self.assertIsInstance(node, URIRef) self.assertIsInstance(edge_dict, Dict) @@ -366,6 +443,12 @@ def test_creates_edge_dictionary(self): def test_detects_complement_of_constructed_classes_true(self): """Tests the detects_complement_of_constructed_classes method when complementOf is present.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up test data node_info = {BNode('N6ebac4ecc22240cdafe506f43d240733'): {'complementOf': OWL.Restriction}} @@ -377,6 +460,12 @@ def test_detects_complement_of_constructed_classes_true(self): def test_detects_complement_of_constructed_classes_false(self): """Tests the detects_complement_of_constructed_classes method when complementOf is not present.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up test data node_info = {BNode('N6ebac4ecc22240cdafe506f43d240733'): { 'type': OWL.Restriction, 'onClass': obo.UBERON_0000061, 'onProperty': obo.RO_0002180}} @@ -389,6 +478,12 @@ def test_detects_complement_of_constructed_classes_false(self): def test_detects_negation_axioms_true(self): """Tests the detects_negation_axioms method for negation axioms when one is present""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up test data node_info = {BNode('N6ebac4ecc22240cdafe506f43d240733'): { 'type': OWL.Restriction, 'onClass': obo.UBERON_0000061, @@ -402,6 +497,12 @@ def test_detects_negation_axioms_true(self): def test_detects_negation_axioms_false(self): """Tests the detects_negation_axioms method for negation axioms when none present""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up test data node = obo.UBERON_0000061 node_info = {BNode('N6ebac4ecc22240cdafe506f43d240733'): { @@ -415,6 +516,12 @@ def test_detects_negation_axioms_false(self): def test_captures_cardinality_axioms(self): """Tests the captures_cardinality_axioms method for a cardinality object.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = 
os.path.abspath(self.dir_loc_owltools) + # set-up input triples = [ (BNode('N6ebac'), URIRef('http://www.w3.org/2002/07/owl#minQualifiedCardinality'), @@ -436,6 +543,12 @@ def test_captures_cardinality_axioms(self): def test_returns_object_property(self): """Tests the returns_object_property method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # when sub and obj are PATO terms and property is none res1 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.PATO_0000402) self.assertIsInstance(res1, URIRef) @@ -463,13 +576,19 @@ def test_returns_object_property(self): # when sub is a PATO term and property is none res6 = self.owl_nets.returns_object_property(obo.PATO_0001199, obo.SO_0000784) - self.assertEqual(res6, None) + self.assertEqual(res6, RDFS.subClassOf) return None def test_parses_subclasses(self): """Tests the parses_subclasses method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up input data node = obo.UBERON_0010757 edges = {'type': OWL.Class, 'subClassOf': obo.UBERON_0002238, 'intersectionOf': BNode('N6add87')} @@ -491,6 +610,12 @@ def test_parses_subclasses(self): def test_parses_anonymous_axioms(self): """Tests the parses_anonymous_axioms method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up input variables class_dict = { BNode('N41aa20'): {'first': obo.SO_0000340, 'rest': BNode('N6e7b')}, @@ -527,6 +652,12 @@ def test_parses_anonymous_axioms(self): def test_parses_constructors_intersection(self): """Tests the parses_constructors method for the intersectionOf class constructor""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up inputs node = obo.SO_0000034 node_info = self.owl_nets.creates_edge_dictionary(node) @@ -545,6 +676,12 @@ def test_parses_constructors_intersection(self): def test_parses_constructors_intersection2(self): """Tests the parses_constructors method for the intersectionOf class constructor""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # set-up inputs node = 
obo.SO_0000078
         node_info = self.owl_nets.creates_edge_dictionary(node)
@@ -561,15 +698,22 @@ def test_parses_constructors_intersection2(self):
         return None

     def test_parses_constructors_union(self):
-        """Tests the parses_constructors method for the unionOf class constructor"""
+        """Tests the parses_constructors method for the unionOf class constructor and the
+        verifies_cleaned_classes method; the two are combined into a single test to reduce processing time."""
+
+        # set-up test data
+        self.graph2 = Graph().parse('http://purl.obolibrary.org/obo/clo.owl', format='xml')
+        self.owl_nets3 = OwlNets(kg_construct_approach='subclass', graph=self.graph2,
+                                 write_location=self.write_location, filename=self.kg_filename2)
+        self.owl_nets3.owl_tools = os.path.abspath(self.dir_loc_owltools)
+
+        ## parses_constructors_union
         # set-up inputs
         node = obo.CL_0000995
         node_info = self.owl_nets3.creates_edge_dictionary(node)
         bnodes = set(x for x in self.owl_nets3.graph.objects(node) if isinstance(x, BNode))
         edges = {k: v for k, v in node_info[1].items() if 'unionOf' in v.keys() and k in bnodes}
         edges = node_info[1][list(x for x in bnodes if x in edges.keys())[0]]
-        # test method
         res = self.owl_nets3.parses_constructors(node, edges, node_info[1])
         self.assertIsInstance(res, Tuple)
@@ -578,11 +722,30 @@ def test_parses_constructors_union(self):
                                        (obo.CL_0001026, RDFS.subClassOf, obo.CL_0000995)])
         self.assertEqual(res[1], None)

+        ## verifies_cleaned_classes
+        # create input data
+        cleaned_classes = {(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597),
+                           (obo.HP_0000602, RDFS.subClassOf, obo.HP_0000597),
+                           (obo.HP_0007715, RDFS.subClassOf, obo.HP_0000597),
+                           (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)}
+        cleaned_result = sorted(list({(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597),
+                                      (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)}))
+        # test method
+        verified_classes = self.owl_nets3.verifies_cleaned_classes(cleaned_classes)
+        self.assertIsInstance(verified_classes, Set)
+        self.assertEqual(sorted(list(verified_classes)), cleaned_result)
+
         return None

     def test_parses_restrictions(self):
         """Tests the parses_restrictions method."""

+        # set up structures needed for testing
+        self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml')
+        self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph,
+                                write_location=self.write_location, filename=self.kg_filename)
+        self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools)
+
         # set-up inputs
         node = obo.SO_0000078
         node_info = self.owl_nets.creates_edge_dictionary(node)
@@ -600,27 +763,15 @@ def test_parses_restrictions(self):
         return None

-    def test_verifies_cleaned_classes(self):
-        """Tests the verifies_cleaned_classes method"""
-
-        # create input data
-        cleaned_classes = {(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597),
-                           (obo.HP_0000602, RDFS.subClassOf, obo.HP_0000597),
-                           (obo.HP_0007715, RDFS.subClassOf, obo.HP_0000597),
-                           (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)}
-        cleaned_result = sorted(list({(obo.HP_0000602, obo.BFO_0000051, obo.HP_0000597),
-                                      (obo.HP_0007715, obo.BFO_0000051, obo.HP_0000597)}))
-
-        # test method
-        verified_classes = self.owl_nets3.verifies_cleaned_classes(cleaned_classes)
-        self.assertIsInstance(verified_classes, Set)
-        self.assertEqual(sorted(list(verified_classes)), cleaned_result)
-
-        return None
-
     def test_cleans_owl_encoded_entities(self):
         """Tests the cleans_owl_encoded_entities method"""

+        # set up structures needed for testing
+        self.graph = 
Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # test method self.owl_nets.cleans_owl_encoded_entities([obo.SO_0000822]) self.assertIsInstance(self.owl_nets.graph, Graph) @@ -636,6 +787,11 @@ def test_cleans_owl_encoded_entities(self): def test_makes_graph_connected_default(self): """Tests the makes_graph_connected method using the default argument for common_ancestor.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) starting_size = len(self.owl_nets.graph) connected_graph = self.owl_nets.makes_graph_connected(self.owl_nets.graph) self.assertTrue(len(connected_graph) > starting_size) @@ -645,6 +801,11 @@ def test_makes_graph_connected_default(self): def test_makes_graph_connected_other(self): """Tests the makes_graph_connected method using something other than the default arg for common_ancestor.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) starting_size = len(self.owl_nets.graph) # test when bad node is passed @@ -660,6 +821,12 @@ def test_makes_graph_connected_other(self): def test_purifies_graph_build_none(self): """Tests the purifies_graph_build method when kg_construction is None.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # initialize method owl_nets = OwlNets(graph=self.graph, write_location=self.write_location, filename=self.kg_filename) @@ -672,6 +839,12 @@ def test_purifies_graph_build_none(self): def test_purifies_graph_build_instance(self): """Tests the purifies_graph_build method when kg_construction is instance.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # initialize method owl_nets = OwlNets(kg_construct_approach='instance', graph=self.graph, write_location=self.write_location, filename=self.kg_filename) @@ -685,6 +858,12 @@ def test_purifies_graph_build_instance(self): def test_purifies_graph_build_subclass(self): """Tests the purifies_graph_build method when kg_construction is subclass.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = 
OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) + # initialize method owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, write_location=self.write_location, filename=self.kg_filename) @@ -698,6 +877,11 @@ def test_purifies_graph_build_subclass(self): def test_write_out_results_regular(self): """Tests the write_out_results method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) self.owl_nets.kg_construct_approach = None graph1, graph2 = self.owl_nets.runs_owlnets(); ray.shutdown() @@ -717,6 +901,11 @@ def test_write_out_results_regular(self): def test_write_out_results_subclass_purified(self): """Tests the owl_nets method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) self.owl_nets.kg_construct_approach = 'subclass' graph1, graph2 = self.owl_nets.runs_owlnets(); ray.shutdown() @@ -745,6 +934,11 @@ def test_write_out_results_subclass_purified(self): def test_write_out_results_instance_purified(self): """Tests the owl_nets method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets2 = OwlNets(kg_construct_approach='instance', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets2.owl_tools = os.path.abspath(self.dir_loc_owltools) graph1, graph2 = self.owl_nets2.runs_owlnets(); ray.shutdown() # test graph output @@ -772,6 +966,11 @@ def test_write_out_results_instance_purified(self): def tests_gets_owlnets_dict(self): """Tests gets_owlnets_dict method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) results = self.owl_nets.gets_owlnets_dict() # verify results @@ -782,6 +981,11 @@ def tests_gets_owlnets_dict(self): def tests_gets_owlnets_graph(self): """Tests gets_owlnets_graphs method.""" + # set up structures needed for testing + self.graph = Graph().parse(self.dir_loc_resources + '/knowledge_graphs/so_with_imports.owl', format='xml') + self.owl_nets = OwlNets(kg_construct_approach='subclass', graph=self.graph, + write_location=self.write_location, filename=self.kg_filename) + self.owl_nets.owl_tools = os.path.abspath(self.dir_loc_owltools) graphs = self.owl_nets.gets_owlnets_graph() # verify results From cedf9bbc40c7268d49a3c47bdab07503f51fd6e6 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 20:36:58 -0500 Subject: [PATCH 13/18] cleaning up slow tests --- pkt_kg/metadata.py | 26 +++++++++----------------- 
pkt_kg/utils/kg_utils.py | 4 ---- tests/test_kg_utils.py | 4 ---- tests/test_metadata.py | 12 ++++-------- 4 files changed, 13 insertions(+), 33 deletions(-) diff --git a/pkt_kg/metadata.py b/pkt_kg/metadata.py index 918942f5..d6de9acc 100644 --- a/pkt_kg/metadata.py +++ b/pkt_kg/metadata.py @@ -3,15 +3,11 @@ # import needed libraries import glob -# import json import logging.config -# import os import os.path import pandas # type: ignore import pickle import re -# import subprocess -import warnings from datetime import datetime from rdflib import Graph, Literal, Namespace, URIRef # type: ignore @@ -25,9 +21,6 @@ oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#') obo = Namespace('http://purl.obolibrary.org/obo/') -# silence warning related to closing file -warnings.simplefilter('ignore', ResourceWarning) - # logging log_dir, log, log_config = 'builds/logs', 'pkt_build_log.log', glob.glob('**/logging.ini', recursive=True) try: @@ -83,9 +76,9 @@ def metadata_processor(self) -> None: if self.node_data: log_str = 'Loading and Processing Node Metadata'; print(log_str); logger.info(log_str) - # self.node_dict = pickle.load(open(self.node_data[0], 'rb'), encoding="utf8") - with open(self.node_data[0], 'rb') as input_file: - self.node_dict = pickle.load(input_file, encoding="utf8") + self.node_dict = pickle.load(open(self.node_data[0], 'rb'), encoding="utf8") + # with open(self.node_data[0], 'rb') as input_file: + # self.node_dict = pickle.load(input_file, encoding="utf8") return None @@ -155,10 +148,9 @@ def extract_metadata(self, graph: Graph) -> None: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type': { 'Label': 'type', 'Description': 'The subject is an instance of a class.', 'Synonym': 'None'}}} - if self.node_data: - # pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) - with open(self.node_data[0], 'wb') as output_file: - pickle.dump(self.node_dict, output_file) + if self.node_data: pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + # with open(self.node_data[0], 'wb') as output_file: + # pickle.dump(self.node_dict, output_file) return None @@ -263,9 +255,9 @@ def output_metadata(self, node_integer_map: Dict, graph: Union[Set, Graph]) -> N # make sure that the metadata dict contains valid entries self._tidy_metadata() - # pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) - with open(self.node_data[0], 'wb') as output_file: - pickle.dump(self.node_dict, output_file) + pickle.dump(self.node_dict, open(self.node_data[0], 'wb')) + # with open(self.node_data[0], 'wb') as output_file: + # pickle.dump(self.node_dict, output_file) # write metadata in flat-file entities = set([i for j in tqdm(graph) for i in j]); filename = self.full_kg[:-4] + '_NodeLabels.txt' diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index 2ab729a7..8373c2e0 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -44,7 +44,6 @@ import os import os.path import pickle -import warnings from collections import Counter # type: ignore from more_itertools import unique_everseen # type: ignore @@ -64,9 +63,6 @@ pkt_bnode = Namespace('https://github.com/callahantiff/PheKnowLator/pkt/bnode/') schema = Namespace('http://www.w3.org/2001/XMLSchema#') -# silence warning relating to closing file -warnings.simplefilter('ignore', ResourceWarning) - def gets_ontology_classes(graph: Graph) -> Set: """Queries a knowledge graph and returns a list of all owl:Class objects (excluding BNodes) in the graph. 
diff --git a/tests/test_kg_utils.py b/tests/test_kg_utils.py index 2315815a..168ad3a6 100644 --- a/tests/test_kg_utils.py +++ b/tests/test_kg_utils.py @@ -4,7 +4,6 @@ import os.path import shutil import unittest -import warnings from mock import patch from typing import Dict, List, Set, Tuple @@ -16,9 +15,6 @@ # set global attributes obo = Namespace('http://purl.obolibrary.org/obo/') -# silence warning relating to closing file -warnings.simplefilter('ignore', ResourceWarning) - class TestKGUtils(unittest.TestCase): """Class to test knowledge graph utility methods.""" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index f2ca0dee..affb5799 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -4,7 +4,6 @@ import os.path import pickle import unittest -import warnings from rdflib import Graph, Namespace from rdflib.namespace import RDF, OWL @@ -13,10 +12,6 @@ from pkt_kg.metadata import * -# silence warning relating to closing file -warnings.simplefilter('ignore', ResourceWarning) - - class TestMetadata(unittest.TestCase): """Class to test the metadata class from the metadata script.""" @@ -35,9 +30,6 @@ def setUp(self): dir_loc2 = os.path.join(current_directory, 'utils/owltools') self.owltools_location = os.path.abspath(dir_loc2) - # create graph data - self.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') - # set-up input arguments self.metadata = Metadata(kg_version='v2.0.0', write_location=self.dir_loc, @@ -87,6 +79,7 @@ def test_creates_node_metadata_nodes(self): """Tests the creates_node_metadata method.""" self.metadata.node_data = [self.metadata.node_data[0].replace('.pkl', '_test.pkl')] + self.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') self.metadata.extract_metadata(self.graph) # test when the node has metadata @@ -125,6 +118,7 @@ def test_creates_node_metadata_relations(self): """Tests the creates_node_metadata method.""" self.metadata.node_data = [self.metadata.node_data[0].replace('.pkl', '_test.pkl')] + self.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') self.metadata.extract_metadata(self.graph) # test when the node has metadata @@ -148,6 +142,7 @@ def test_creates_node_metadata_none(self): """Tests the creates_node_metadata method when node_dict is None.""" self.metadata.node_data = [self.metadata.node_data[0].replace('.pkl', '_test.pkl')] + self.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') self.metadata.extract_metadata(self.graph) self.metadata.node_dict = None @@ -182,6 +177,7 @@ def test_extract_metadata(self): # extract metadata self.metadata.node_data = [self.metadata.node_data[0].replace('.pkl', '_test.pkl')] + self.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') self.metadata.extract_metadata(graph=self.graph) # check that it worked From b5244d06a2f9aa73bd0a1463b112d390bd02effe Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 20:37:34 -0500 Subject: [PATCH 14/18] addressing weird linting error --- pkt_kg/owlnets.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkt_kg/owlnets.py b/pkt_kg/owlnets.py index a69a5ae5..8e75bb02 100644 --- a/pkt_kg/owlnets.py +++ b/pkt_kg/owlnets.py @@ -9,7 +9,6 @@ import os.path import pickle import ray # type: ignore -# import re from collections import ChainMap # type: ignore from random import sample, shuffle @@ -818,7 +817,7 @@ def runs_owlnets(self, cpus: int = 1) -> Tuple: except RuntimeError: pass acts = [ray.remote(OwlNets).remote(self.graph, 
loc, f, cons, ot) for _ in range(cpus)] # type: ignore for i in range(0, cpus): acts[i % cpus].cleans_owl_encoded_entities.remote(entities[i]) # type: ignore - _ = ray.wait([x.gets_owlnets_graph.remote() for x in acts], num_returns=len(acts)) + _ = ray.wait([x.gets_owlnets_graph.remote() for x in acts], num_returns=len(acts)) # type: ignore graph_res = ray.get([x.gets_owlnets_graph.remote() for x in acts]) # type: ignore full_graph = adds_edges_to_graph(full_graph, set(x for y in set(graph_res) for x in y), False) res2 += ray.get([x.gets_owlnets_dict.remote() for x in acts]); del acts # type: ignore From 989bac0077a6ed65b58688ce6c381be7c8947c75 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 21:05:32 -0500 Subject: [PATCH 15/18] bumping version for release --- pkt_kg/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkt_kg/__version__.py b/pkt_kg/__version__.py index e2fa8560..92ce8250 100644 --- a/pkt_kg/__version__.py +++ b/pkt_kg/__version__.py @@ -1,2 +1,2 @@ """Current version of package pkt_kg""" -__version__ = "3.0.2" +__version__ = "3.1.0" From c6f9cb44a9df70a4eda436c2c9eff0be777eb50a Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 21:12:44 -0500 Subject: [PATCH 16/18] addressing docker error --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index f41a8990..7f2a0d30 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,15 +4,15 @@ ############################################ ## MULTI-STAGE CONTAINER CONFIGURATION ## FROM python:3.6.2 -RUN apt-get update && apt-get install -y \ +RUN sudo apt-get update && apt-get install -y \ apt-transport-https \ software-properties-common \ unzip \ curl RUN wget -O- https://apt.corretto.aws/corretto.key | apt-key add - && \ add-apt-repository 'deb https://apt.corretto.aws stable main' && \ - apt-get update && \ - apt-get install -y java-1.8.0-amazon-corretto-jdk + sudo apt-get update && \ + sudo apt-get install -y java-1.8.0-amazon-corretto-jdk ############################################ From a5de26aac6854f6f6a198e248bdda334c2c4f6fa Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 21:42:09 -0500 Subject: [PATCH 17/18] updating arguments --- .github/workflows/build-qa.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-qa.yml b/.github/workflows/build-qa.yml index 8ca847ab..75b115ac 100644 --- a/.github/workflows/build-qa.yml +++ b/.github/workflows/build-qa.yml @@ -60,17 +60,17 @@ jobs: runs-on: ubuntu-latest steps: - name: Set-up QEMU - uses: docker/setup-qemu-action@v1 + uses: docker/setup-qemu-action@v2 - name: Set-up Docker Buildx - uses: docker/setup-buildx-action@v1 + uses: docker/setup-buildx-action@v2 - name: Login to DockerHub - uses: docker/login-action@v1 + uses: docker/login-action@v2 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Build and Push Container to DockerHub id: docker_build - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v4 with: push: true tags: callahantiff/pheknowlator:latest From 9b42ca3d980d7a91f82160e2caa866e920702e19 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Sat, 11 Mar 2023 22:17:42 -0500 Subject: [PATCH 18/18] fixing docker error --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7f2a0d30..f41a8990 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,15 +4,15 @@ 
############################################ ## MULTI-STAGE CONTAINER CONFIGURATION ## FROM python:3.6.2 -RUN sudo apt-get update && apt-get install -y \ +RUN apt-get update && apt-get install -y \ apt-transport-https \ software-properties-common \ unzip \ curl RUN wget -O- https://apt.corretto.aws/corretto.key | apt-key add - && \ add-apt-repository 'deb https://apt.corretto.aws stable main' && \ - sudo apt-get update && \ - sudo apt-get install -y java-1.8.0-amazon-corretto-jdk + apt-get update && \ + apt-get install -y java-1.8.0-amazon-corretto-jdk ############################################
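
A quick way to sanity-check the returns_object_property decision table from patch 12. This is an illustrative sketch only, not part of the patches above: it assumes pkt_kg >= 3.1.0 is importable, and the PATO/SO/GO/RO identifiers below are arbitrary examples rather than values taken from the test suite. Because returns_object_property is a @staticmethod, no OwlNets instance (and no graph) is required.

from rdflib import URIRef
from rdflib.namespace import RDFS

from pkt_kg.owlnets import OwlNets

obo = 'http://purl.obolibrary.org/obo/'
pato1, pato2 = URIRef(obo + 'PATO_0001199'), URIRef(obo + 'PATO_0000402')  # example PATO terms
so_term, go_term = URIRef(obo + 'SO_0000784'), URIRef(obo + 'GO_2000380')  # example non-PATO terms
rel = URIRef(obo + 'RO_0002202')  # example object property

# both terms are PATO terms, no property --> rdfs:subClassOf
assert OwlNets.returns_object_property(pato1, pato2) == RDFS.subClassOf
# object is a PATO term (with or without a property) --> obo:RO_0000086
assert OwlNets.returns_object_property(so_term, pato2, rel) == URIRef(obo + 'RO_0000086')
# subject is a PATO term, no property --> rdfs:subClassOf (the pre-patch code returned None here,
# per the updated assertion on res6 in test_returns_object_property)
assert OwlNets.returns_object_property(pato1, so_term) == RDFS.subClassOf
# neither term is a PATO term and a property is given --> that property
assert OwlNets.returns_object_property(so_term, go_term, rel) == rel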