From bed084972cd7b9fe4fac58166d767d387c78f55e Mon Sep 17 00:00:00 2001
From: ynerant
Date: Thu, 13 Jan 2022 22:06:19 +0100
Subject: [PATCH] Upload a new file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..392abd8
--- /dev/null
+++ b/main.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+
+import argparse
+import itertools
+import json
+import re
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from functools import lru_cache
+
+from SPARQLWrapper import SPARQLWrapper
+from bs4 import BeautifulSoup
+
+
+@dataclass
+class Property:
+    url: str
+    range: str | None = None
+    domain: str | None = None
+
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def from_url(url: str) -> "Property":
+        """
+        A property is identified by its URL alone; instances are cached so
+        that the same URL always maps to the same Property object.
+        """
+        return Property(url)
+
+    @property
+    def name(self) -> str:
+        return self.url.split('/')[-1]
+
+    @property
+    def range_name(self) -> str:
+        return self.range.split('/')[-1] if self.range else ""
+
+
+@dataclass
+class Graph:
+    node: str
+    children: list[tuple[Property, "Graph"]]
+
+    def to_tikz(self, prefix: str = "0", x: float = 0, depth: int = 0, header: bool = False) -> str:
+        tikz = ""
+
+        if header:
+            tikz = "\\begin{center}\n"
+            tikz += "\\begin{tikzpicture}[y=3cm]\n"
+
+        # Draw this node, then recurse on the children and link them.
+        tikz += f"\\node[draw] ({prefix}) at ({x:.02f}, -{depth}) {{{self.node}}};\n"
+
+        for j, (rel, subgraph) in enumerate(self.children):
+            tikz += subgraph.to_tikz(f"{prefix}-{j}", 5 * j / (depth + 1), depth + 1)
+            tikz += f"\\draw[->] ({prefix}) -- ({prefix}-{j}) node[midway,above,sloped] {{{rel.name}}};\n"
+
+        if header:
+            tikz += "\\end{tikzpicture}\n"
+            tikz += "\\end{center}\n"
+
+        return tikz
+
+
+def parse_arguments() -> tuple[str, int, int, int]:
+    """
+    Parse the command line arguments.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('classname', type=str, help="Name of the DBpedia class to analyse")
+    parser.add_argument('threshold', type=int, help="Number of exceptions tolerated (the n of n-almost keys)")
+    parser.add_argument('--limit', '-l', type=int, default=1000, help="Limit the number of queried results")
+    parser.add_argument('--recursion', '-r', type=int, default=2, help="Maximum height of the generated graph keys")
+    args = parser.parse_args()
+
+    return args.classname, args.threshold, args.limit, args.recursion
+
+
+def query(classname: str, filename: str, limit: int = 1000) -> None:
+    """
+    Query DBpedia through SPARQL for triples about instances of the given
+    class, and store them in filename as N-Triples.
+    """
+    sparql = SPARQLWrapper(endpoint='http://dbpedia.org/sparql')
+    sparql.setQuery(f"""prefix db-owl: <http://dbpedia.org/ontology/>
+
+SELECT DISTINCT ?x ?r ?y
+WHERE {{
+    ?x rdf:type db-owl:{classname} .
+    ?x ?r ?y .
+}} LIMIT {limit}""")
+    sparql.setReturnFormat('tsv')
+    results = sparql.query().convert().decode('UTF-8')
+
+    with open(filename, 'w') as f:
+        for i, line in enumerate(results.split('\n')):
+            if i == 0:
+                # Skip the TSV header line.
+                continue
+
+            if '\t' in line:
+                # Strip the surrounding quotes and write an N-Triple.
+                x, r, y = line.split('\t')
+                x, r, y = x[1:-1], r[1:-1], y[1:-1]
+                f.write(f'<{x}> <{r}> <{y}> .\n')
+
+
+def sakey_output(file: str, threshold: int) -> tuple[list[list[str]], list[list[str]]]:
+    """
+    Run SAKey on the given dataset file and return its parsed output:
+    the non-keys and the almost-keys.
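+
+    SAKey writes two labelled lines on stdout, shaped roughly like this
+    (an illustrative sketch, not verbatim SAKey output):
+        non keys: [[http://.../p1, http://.../p2], ...]
+        3-almost keys: [[http://.../p3], ...]
+    and the parsing below relies on that shape.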
+ """ + process = subprocess.Popen(['java', '-jar', 'material/sakey.jar', file, str(threshold)], stdout=subprocess.PIPE) + process.wait() + + if process.returncode != 0: + # There was an error, that was printed in stderr. + exit(process.returncode) + + with process.stdout as stdout: + # Read the output from the process. + content = stdout.read().decode('UTF-8') + + # Parse the output. + non_keys, _, almost_keys, _ = content.split('\n') + + # Remove header + non_keys = non_keys.split(':', 1)[1][1:] + almost_keys = almost_keys.split(':', 1)[1] + + # Quote URLs + non_keys = re.sub('(\w)]', '\g<1>"]', re.sub('(\w),', '\g<1>",', re.sub('http', '"http', non_keys))) + almost_keys = re.sub('(\w)]', '\g<1>"]', re.sub('(\w),', '\g<1>",', re.sub('http', '"http', almost_keys))) + + # Parse JSON + non_keys = json.loads(non_keys) + almost_keys = json.loads(almost_keys) + + return non_keys, almost_keys + + +def find_interesting_keys(keys: list[list[str]]) -> list[list[Property]]: + """ + Parse keys to a better structure. + """ + interesting_keys = [] + + for key in keys: + # A key is a list of properties. + # We transform this key into a simple graph key + # Then we try to find a range and a domain for each property. + for prop in key: + prop = Property.from_url(prop) + if prop.range: + # This property can be extended. + interesting_keys.append([Property.from_url(url) for url in key]) + break + + return interesting_keys + + +def process(classname: str, threshold: int, limit: int = 1000, recursion: int = 2) -> list[Graph]: + filename = tempfile.NamedTemporaryFile().name + query(classname, filename, limit) + + # Parse output from SAKey + non_keys, almost_keys = sakey_output(filename, threshold) + + i_keys = find_interesting_keys(almost_keys) + + graph_keys = [Graph(classname, [])] + + if recursion <= 1: + for key in i_keys: + graph_keys.append(Graph(classname, [(p, Graph(p.range_name, [])) for p in key])) + else: + for key in i_keys: + subgraphs_list = [] + for p in key: + subgraphs = process(p.range_name, threshold, limit, recursion - 1) + subgraphs_list.append(subgraphs) + + for subgraphs_array in itertools.product(*subgraphs_list): + graph_keys.append(Graph(classname, [(p, g) for p, g in zip(key, subgraphs_array)])) + + return graph_keys + + +def main() -> None: + # Parse DBPedia ontology + with open('datasets/dbpedia.owl') as f: + ontology = BeautifulSoup(f, 'xml') + + for e in ontology.find_all('ns0:range'): + url = e.parent.get('rdf:about') + p = Property.from_url(url) + resource = e.get('rdf:resource') + if '#' not in resource: + p.range = resource + + for e in ontology.find_all('ns0:domain'): + url = e.parent.get('rdf:about') + p = Property.from_url(url) + resource = e.get('rdf:resource') + if '#' not in resource: + p.domain = resource + + # Read arguments + classname, threshold, limit, recursion = parse_arguments() + + # Process this class name + graph_keys = process(classname, threshold, limit, recursion) + + for graph in graph_keys: + print(graph.to_tikz(header=True)) + + print("\n") + + +if __name__ == '__main__': + main()