From 8fbd1f16845ae58dea1415e280345b89576a92da Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Thu, 21 Jun 2018 13:35:25 +0200 Subject: [PATCH 01/27] Test for evaluate() and mutate_fix_var() --- tests/__init__.py | 0 tests/test_fv_eval.py | 116 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_fv_eval.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_fv_eval.py b/tests/test_fv_eval.py new file mode 100644 index 0000000..a924070 --- /dev/null +++ b/tests/test_fv_eval.py @@ -0,0 +1,116 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +"""test_mutate_fix_var und test_evaluate einmal davor und +einmal über die results aus mutate_fix_var +""" + +import logging +from collections import OrderedDict + +import SPARQLWrapper +import rdflib +from rdflib import URIRef +from rdflib import Variable + +from config import SPARQL_ENDPOINT +from gp_learner import evaluate +from gp_learner import mutate_fix_var +from gp_learner import update_individuals +from gp_query import calibrate_query_timeout +from gp_query import query_time_hard_exceeded +from gp_query import query_time_soft_exceeded +from graph_pattern import GraphPattern +from graph_pattern import SOURCE_VAR +from graph_pattern import TARGET_VAR +from ground_truth_tools import get_semantic_associations +from ground_truth_tools import split_training_test_set +from gtp_scores import GTPScores +from os import getenv + +logger = logging.getLogger(__name__) + +dbp = rdflib.Namespace('http://dbpedia.org/resource/') + + +v = Variable('v') + +gp = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR), + ]) + +ground_truth_pairs_ = [ + (dbp['Berlin'],dbp['Germany']), + (dbp['Hamburg'],dbp['Germany']), + (dbp['Kaiserslautern'],dbp['Germany']), + (dbp['Wien'],dbp['Austria']), + (dbp['Insbruck'],dbp['Austria']), + (dbp['Salzburg'],dbp['Austria']), + (dbp['Paris'],dbp['France']), + (dbp['Lyon'],dbp['France']), + (dbp['Amsterdam'],dbp['Netherlands']), + (dbp['Brussels'],dbp['Belgium']), + (dbp['Washington'],dbp['United_States']), + (dbp['Madrid'],dbp['Spain']), + (dbp['Prague'],dbp['Czech_Republic']), + (dbp['Bern'],dbp['Switzerland']), +] + +gtp_scores_ = GTPScores(ground_truth_pairs_) + +sparql = SPARQLWrapper.SPARQLWrapper(getenv('SPARQL_ENDPOINT','http://dbpedia.org/sparql')) +try: + timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup +except IOError: + from nose import SkipTest + raise SkipTest( + "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" + "Skipping tests in\n %s" % (SPARQL_ENDPOINT, __file__)) + +def test_eval(): + res, matching_node_pairs, gtp_precisions = evaluate(sparql, timeout, gtp_scores_, gp, run=0, gen=0) + logger.log( + logging.INFO, + 'Results are:\n' + 'remaining_gain: %d\n' + 'score: %d\n' + 'gain: %d\n' + 'fm: %d\n' + 'avg_res_length: %d\n' + 'sum_gt_matches: %d\n' + 'pattern_length: %d\n' + 'pattern_vars:: %d\n' + 'qtime_exceeded: %d\n' + 'query_time: %d\n' + % res + ) + +def test_mut_fv(): + res = mutate_fix_var(sparql,timeout,gtp_scores_,gp,rand_var=v) + for gp_ in res: + logger.info(gp_) + +def test_eval_list(): + list = mutate_fix_var(sparql,timeout,gtp_scores_,gp,rand_var=v) + for gp_ in list: + res, matching_node_pairs, gtp_precisions = evaluate(sparql, timeout, gtp_scores_, gp_, run=0, gen=0) + logger.log( + logging.INFO, + 'For %s\n' + '%s', gp_, + 'the results 
are:\n' + 'remaining_gain: %d\n' + 'score: %d\n' + 'gain: %d\n' + 'fm: %d\n' + 'avg_res_length: %d\n' + 'sum_gt_matches: %d\n' + 'pattern_length: %d\n' + 'pattern_vars:: %d\n' + 'qtime_exceeded: %d\n' + 'query_time: %d\n' + %res + ) + From db78db407ca26c718653e044eee0f9db7ae2eeb7 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Mon, 25 Jun 2018 14:23:17 +0200 Subject: [PATCH 02/27] modified test_fv_eval.py --- .gitignore | 3 + graph_pattern.py | 1 + requirements.txt | 4 +- tests/__init__.py | 0 tests/test_fv_eval.py | 171 ++++++++++++++++++++++++++---------------- 5 files changed, 113 insertions(+), 66 deletions(-) delete mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore index 172c006..3b99ee0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ venv/ # ignore py compiled etc. files *.pyc *.pyo + +# ignore .idea +.idea/ diff --git a/graph_pattern.py b/graph_pattern.py index a483c88..0a23c68 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -846,6 +846,7 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): 'triples': self._sparql_triples_part(' '), 'limit': limit, } + print(res) return self._sparql_prefix(textwrap.dedent(res)) def to_dict(self): diff --git a/requirements.txt b/requirements.txt index d61d2fd..4a02904 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,8 @@ nose>=1.3.7 numpy>=1.12.1 objgraph>=3.1.0 requests>=2.16.5 -#rdflib>=4.2.1 -git+git://github.com/RDFLib/rdflib@master#egg=rdflib +rdflib>=4.2.1 +#git+git://github.com/RDFLib/rdflib@master#egg=rdflib scikit-learn>=0.18.1 scipy>=0.19.0 scoop>=0.7.1.1 diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_fv_eval.py b/tests/test_fv_eval.py index a924070..ed501c6 100644 --- a/tests/test_fv_eval.py +++ b/tests/test_fv_eval.py @@ -9,6 +9,7 @@ import logging from collections import OrderedDict +from os import getenv import SPARQLWrapper import rdflib @@ -28,39 +29,84 @@ from ground_truth_tools import get_semantic_associations from ground_truth_tools import split_training_test_set from gtp_scores import GTPScores -from os import getenv +from serialization import print_graph_pattern logger = logging.getLogger(__name__) dbp = rdflib.Namespace('http://dbpedia.org/resource/') +owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#') + +a = Variable('a') +b = Variable('b') +c = Variable('c') +d = Variable('d') +e = Variable('e') +f = Variable('f') +v = Variable('v') +w = Variable('w') + +sameAs = owl['sameAs'] + +gp_1 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR) +]) + +gp_2 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR), + (TARGET_VAR, w, SOURCE_VAR) +]) + +gp_3 = GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (d, e, TARGET_VAR) +]) + +gp_4 = GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (TARGET_VAR, e, d) +]) + +ground_truth_pairs_1 = [ + (dbp['Berlin'], dbp['Germany']), + (dbp['Hamburg'], dbp['Germany']), + (dbp['Kaiserslautern'], dbp['Germany']), + (dbp['Wien'], dbp['Austria']), + (dbp['Insbruck'], dbp['Austria']), + (dbp['Salzburg'], dbp['Austria']), + (dbp['Paris'], dbp['France']), + (dbp['Lyon'], dbp['France']), + (dbp['Amsterdam'], dbp['Netherlands']), + (dbp['Brussels'], dbp['Belgium']), + (dbp['Washington'], dbp['United_States']), + (dbp['Madrid'], dbp['Spain']), + (dbp['Prague'], dbp['Czech_Republic']), + (dbp['Bern'], dbp['Switzerland']), +] +ground_truth_pairs_2 = get_semantic_associations() +ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2) 
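+# NOTE: the [1:10] slice below keeps only 9 of the pairs and drops the first
+# one; presumably just a small subsample so the test queries stay fast.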
+ground_truth_pairs_2 = ground_truth_pairs_2[1:10] -v = Variable('v') +ground_truth_pairs_3 = [ + (dbp['Barrister'], dbp['Law']), + (dbp['Christ'], dbp['Jesus']), + (dbp['Pottage'], dbp['Soup']) + ] -gp = GraphPattern([ - (SOURCE_VAR, v, TARGET_VAR), - ]) - -ground_truth_pairs_ = [ - (dbp['Berlin'],dbp['Germany']), - (dbp['Hamburg'],dbp['Germany']), - (dbp['Kaiserslautern'],dbp['Germany']), - (dbp['Wien'],dbp['Austria']), - (dbp['Insbruck'],dbp['Austria']), - (dbp['Salzburg'],dbp['Austria']), - (dbp['Paris'],dbp['France']), - (dbp['Lyon'],dbp['France']), - (dbp['Amsterdam'],dbp['Netherlands']), - (dbp['Brussels'],dbp['Belgium']), - (dbp['Washington'],dbp['United_States']), - (dbp['Madrid'],dbp['Spain']), - (dbp['Prague'],dbp['Czech_Republic']), - (dbp['Bern'],dbp['Switzerland']), +ground_truth_pairs_4 = [ + (dbp['Motorrad_(disambiguation)'], dbp['Bmw_motorcycle']), + (dbp['Horse'], dbp['Saddle']) ] -gtp_scores_ = GTPScores(ground_truth_pairs_) +gtp_scores_1 = GTPScores(ground_truth_pairs_1) +gtp_scores_2 = GTPScores(ground_truth_pairs_2) +gtp_scores_3 = GTPScores(ground_truth_pairs_3) +gtp_scores_4 = GTPScores(ground_truth_pairs_4) -sparql = SPARQLWrapper.SPARQLWrapper(getenv('SPARQL_ENDPOINT','http://dbpedia.org/sparql')) +sparql = SPARQLWrapper.SPARQLWrapper( + getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) try: timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup except IOError: @@ -69,48 +115,45 @@ "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" "Skipping tests in\n %s" % (SPARQL_ENDPOINT, __file__)) -def test_eval(): - res, matching_node_pairs, gtp_precisions = evaluate(sparql, timeout, gtp_scores_, gp, run=0, gen=0) - logger.log( - logging.INFO, - 'Results are:\n' - 'remaining_gain: %d\n' - 'score: %d\n' - 'gain: %d\n' - 'fm: %d\n' - 'avg_res_length: %d\n' - 'sum_gt_matches: %d\n' - 'pattern_length: %d\n' - 'pattern_vars:: %d\n' - 'qtime_exceeded: %d\n' - 'query_time: %d\n' - % res - ) - -def test_mut_fv(): - res = mutate_fix_var(sparql,timeout,gtp_scores_,gp,rand_var=v) + +def test_eval(gtp_scores, gp): + res, matching_node_pairs, gtp_precisions = evaluate( + sparql, timeout, gtp_scores, gp, run=0, gen=0) + update_individuals([gp], [(res, matching_node_pairs, gtp_precisions)]) + logger.info(gp.fitness) + + +def test_mut_fv(gtp_scores, gp, r=None): + res = mutate_fix_var(sparql, timeout, gtp_scores, gp, rand_var=r) for gp_ in res: logger.info(gp_) -def test_eval_list(): - list = mutate_fix_var(sparql,timeout,gtp_scores_,gp,rand_var=v) - for gp_ in list: - res, matching_node_pairs, gtp_precisions = evaluate(sparql, timeout, gtp_scores_, gp_, run=0, gen=0) - logger.log( - logging.INFO, - 'For %s\n' - '%s', gp_, - 'the results are:\n' - 'remaining_gain: %d\n' - 'score: %d\n' - 'gain: %d\n' - 'fm: %d\n' - 'avg_res_length: %d\n' - 'sum_gt_matches: %d\n' - 'pattern_length: %d\n' - 'pattern_vars:: %d\n' - 'qtime_exceeded: %d\n' - 'query_time: %d\n' - %res - ) +def test_eval_list(gtp_scores, gp, r=None): + mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp, rand_var=r) + for gp_ in mfv_res: + res, matching_node_pairs, gtp_precisions = evaluate( + sparql, timeout, gtp_scores, gp_, run=0, gen=0) + update_individuals([gp_], [(res, matching_node_pairs, gtp_precisions)]) + print_graph_pattern(gp_, print_matching_node_pairs=0) + return mfv_res + + +def test_eval_list_double(gtp_scores, gp, r_1=None, r_2=None): + # testing double execution of mutate_fix_var() on gp + res = test_eval_list(gtp_scores, gp, r_1) + gtp_scores.update_with_gps(res) + res_list = 
list(res) + for gp in res: + res_ = test_eval_list(gtp_scores, gp, r_2) + for gp_ in res_: + res_list.append(gp_) + gtp_scores.update_with_gps(res_list) + for gp in res_list: + print_graph_pattern(gp, print_matching_node_pairs=0) + + +if __name__ == '__main__': + #test_eval_list_double(gtp_scores_1, gp_2) + + test_eval_list_double(gtp_scores_4, gp_4, a, e) From 07d1f39277f3e0217956aa9e1a594109cb99a41b Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Tue, 10 Jul 2018 14:54:07 +0200 Subject: [PATCH 03/27] Test to find one hop patterns with SAMPLE-Queries --- graph_pattern.py | 47 +++++++- tests/test_fv_eval.py | 25 +++- tests/test_sampling.py | 263 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 329 insertions(+), 6 deletions(-) create mode 100644 tests/test_sampling.py diff --git a/graph_pattern.py b/graph_pattern.py index 0a23c68..635fec1 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -636,6 +636,52 @@ def to_sparql_select_query( res = textwrap.dedent(res) return self._sparql_prefix(res) + def to_sparql_select_sample_query( + self, + values, + projection=None, + limit=None, + sample_var=None + ): + """Generates a SPARQL select sample query from the graph pattern. + + Examples: + TODO + + Args: + values: a dict mapping a variable tuple to a list of binding tuples, + e.g. {(v1, v2): [(uri1, uri2), (uri3, uri4), ...]} + projection: which variables to select on, by default all vars. + limit: integer to limit the result size + sample_var: the variable to sample over + """ + assert self.vars_in_graph, \ + "tried to get sparql for pattern without vars: %s" % (self,) + + if projection is None: + projection = sorted([v for v in self.vars_in_graph]) + + if sample_var is None: + sample_var = random.choice(projection) + logger.info(sample_var) + + projection.remove(sample_var) + + res = "SELECT %(samp)s %(proj)s WHERE {\n%(qpp)s}\n%(lim)s" % { + 'samp': (' SAMPLE(%s) as %s' % ( + ''.join(sample_var.n3()), + ''.join(sample_var.n3()) + )), + 'proj': ' '.join([v.n3() for v in projection]), + 'qpp': self._sparql_query_pattern_part( + values=values, + indent=' ', + ), + 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + def to_sparql_ask_query( self, bind=None, @@ -846,7 +892,6 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): 'triples': self._sparql_triples_part(' '), 'limit': limit, } - print(res) return self._sparql_prefix(textwrap.dedent(res)) def to_dict(self): diff --git a/tests/test_fv_eval.py b/tests/test_fv_eval.py index ed501c6..3c847b6 100644 --- a/tests/test_fv_eval.py +++ b/tests/test_fv_eval.py @@ -8,11 +8,17 @@ """ import logging +from collections import defaultdict from collections import OrderedDict from os import getenv import SPARQLWrapper +from splendid import get_path +from splendid import time_func +import socket import rdflib +from rdflib import BNode +from rdflib import Literal from rdflib import URIRef from rdflib import Variable @@ -30,6 +36,7 @@ from ground_truth_tools import split_training_test_set from gtp_scores import GTPScores from serialization import print_graph_pattern +from utils import sparql_json_result_bindings_to_rdflib logger = logging.getLogger(__name__) @@ -68,6 +75,14 @@ (TARGET_VAR, e, d) ]) +gp_5 = GraphPattern([ + (SOURCE_VAR, a, c), + (TARGET_VAR, URIRef('http://dbpedia.org/ontology/thumbnail'), d), + (TARGET_VAR, URIRef('http://dbpedia.org/property/image'), b), + (c, URIRef('http://dbpedia.org/ontology/wikiPageWikiLink'), SOURCE_VAR), + 
(c, URIRef('http://purl.org/linguistics/gold/hypernym'), TARGET_VAR) +]) + ground_truth_pairs_1 = [ (dbp['Berlin'], dbp['Germany']), (dbp['Hamburg'], dbp['Germany']), @@ -87,7 +102,7 @@ ground_truth_pairs_2 = get_semantic_associations() ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2) -ground_truth_pairs_2 = ground_truth_pairs_2[1:10] +ground_truth_pairs_2 = ground_truth_pairs_2[1:100] ground_truth_pairs_3 = [ (dbp['Barrister'], dbp['Law']), @@ -107,6 +122,7 @@ sparql = SPARQLWrapper.SPARQLWrapper( getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) +#sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT) try: timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup except IOError: @@ -152,8 +168,7 @@ def test_eval_list_double(gtp_scores, gp, r_1=None, r_2=None): for gp in res_list: print_graph_pattern(gp, print_matching_node_pairs=0) - if __name__ == '__main__': - #test_eval_list_double(gtp_scores_1, gp_2) - - test_eval_list_double(gtp_scores_4, gp_4, a, e) + test_steps(ground_truth_pairs_2) + #values = {(SOURCE_VAR, TARGET_VAR): ground_truth_pairs_1} + #print(gp_1.to_sparql_select_sample_query(values)) diff --git a/tests/test_sampling.py b/tests/test_sampling.py new file mode 100644 index 0000000..c0afe08 --- /dev/null +++ b/tests/test_sampling.py @@ -0,0 +1,263 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +"""Tested das bauen von graph_pattern per gesampeltem finden von 1-hop wegen +und fix-var-mutation +""" + +import logging +from collections import defaultdict +from collections import OrderedDict +from os import getenv + +import SPARQLWrapper +from splendid import get_path +from splendid import time_func +import socket +import rdflib +from rdflib import BNode +from rdflib import Literal +from rdflib import URIRef +from rdflib import Variable + +from config import SPARQL_ENDPOINT +from gp_learner import evaluate +from gp_learner import mutate_fix_var +from gp_learner import update_individuals +from gp_query import calibrate_query_timeout +from gp_query import query_time_hard_exceeded +from gp_query import query_time_soft_exceeded +from graph_pattern import GraphPattern +from graph_pattern import SOURCE_VAR +from graph_pattern import TARGET_VAR +from ground_truth_tools import get_semantic_associations +from ground_truth_tools import split_training_test_set +from gtp_scores import GTPScores +from serialization import print_graph_pattern +from utils import sparql_json_result_bindings_to_rdflib + +logger = logging.getLogger(__name__) + +sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT) +#sparql = SPARQLWrapper.SPARQLWrapper( +# getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) +try: + timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup +except IOError: + from nose import SkipTest + raise SkipTest( + "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" + "Skipping tests in\n %s" % (sparql.endpoint, __file__)) + +dbp = rdflib.Namespace('http://dbpedia.org/resource/') +owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#') + +a = Variable('a') +b = Variable('b') +c = Variable('c') +d = Variable('d') +e = Variable('e') +f = Variable('f') +v = Variable('v') +w = Variable('w') + +sameAs = owl['sameAs'] + +gp_1 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR) +]) + +gp_2 = GraphPattern([ + (SOURCE_VAR, v, TARGET_VAR), + (TARGET_VAR, w, SOURCE_VAR) +]) + +gp_3 = GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (d, e, TARGET_VAR) +]) + +gp_4 = 
GraphPattern([ + (SOURCE_VAR, a, b), + (b, c, d), + (TARGET_VAR, e, d) +]) + +gp_5 = GraphPattern([ + (SOURCE_VAR, a, c), + (TARGET_VAR, URIRef('http://dbpedia.org/ontology/thumbnail'), d), + (TARGET_VAR, URIRef('http://dbpedia.org/property/image'), b), + (c, URIRef('http://dbpedia.org/ontology/wikiPageWikiLink'), SOURCE_VAR), + (c, URIRef('http://purl.org/linguistics/gold/hypernym'), TARGET_VAR) +]) + +ground_truth_pairs_1 = [ + (dbp['Berlin'], dbp['Germany']), + (dbp['Hamburg'], dbp['Germany']), + (dbp['Kaiserslautern'], dbp['Germany']), + (dbp['Wien'], dbp['Austria']), + (dbp['Insbruck'], dbp['Austria']), + (dbp['Salzburg'], dbp['Austria']), + (dbp['Paris'], dbp['France']), + (dbp['Lyon'], dbp['France']), + (dbp['Amsterdam'], dbp['Netherlands']), + (dbp['Brussels'], dbp['Belgium']), + (dbp['Washington'], dbp['United_States']), + (dbp['Madrid'], dbp['Spain']), + (dbp['Prague'], dbp['Czech_Republic']), + (dbp['Bern'], dbp['Switzerland']), +] + +ground_truth_pairs_2 = get_semantic_associations() +ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2) +ground_truth_pairs_2 = ground_truth_pairs_2[1:100] + +ground_truth_pairs_3 = [ + (dbp['Barrister'], dbp['Law']), + (dbp['Christ'], dbp['Jesus']), + (dbp['Pottage'], dbp['Soup']) + ] + +ground_truth_pairs_4 = [ + (dbp['Motorrad_(disambiguation)'], dbp['Bmw_motorcycle']), + (dbp['Horse'], dbp['Saddle']) +] + +gtp_scores_1 = GTPScores(ground_truth_pairs_1) +gtp_scores_2 = GTPScores(ground_truth_pairs_2) +gtp_scores_3 = GTPScores(ground_truth_pairs_3) +gtp_scores_4 = GTPScores(ground_truth_pairs_4) + + +def test_steps(gtps): + values = {(SOURCE_VAR, TARGET_VAR): gtps} + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # SPARQL-Query die über eine Var aus gp1 random samplet. 
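+    # (i.e. a query that draws a random sample of bindings for b via gp1;
+    # the sampled b values then seed the gp2 query further below)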
+    # TODO: change the query so that it filters by count (see log.txt)
+    q = gp1.to_sparql_select_sample_query(values=values, limit=100)
+    logger.info(q)
+    t, q_res = run_query(q)
+    logger.info(q_res)
+    # Create b_list, in which the results for b are "stored"
+    # TODO: store everything, so the path can be retraced later
+    res_rows_path = ['results', 'bindings']
+    bind = sparql_json_result_bindings_to_rdflib(
+        get_path(q_res, res_rows_path, default=[])
+    )
+    b_list = []
+    for row in bind:
+        x = get_path(row, [b])
+        y = (x, )
+        b_list.append(y)
+    logger.info('orig query took %.4f s, result:\n%s\n', t, b_list)
+    b_list[:] = [b_l for b_l in b_list if not list_remove_bool(b_l[0])]
+    # Values for the next query: b_list
+    values = {(b, ): b_list}
+    # Query that randomly samples over a var from gp2, with values from b_list
+    q = gp2.to_sparql_select_sample_query(values=values, limit=5000)
+    logger.info(q)
+    t, q_res = run_query(q)
+    # Create target_list, in which the "found" targets are recorded
+    res_rows_path = ['results', 'bindings']
+    bind = sparql_json_result_bindings_to_rdflib(
+        get_path(q_res, res_rows_path, default=[])
+    )
+    target_list = []
+    for row in bind:
+        target_list.append(get_path(row, [TARGET_VAR]))
+    logger.info('orig query took %.4f s, result:\n%s\n', t, q_res)
+    # Create gtps_2, in which all gtps whose targets are contained in
+    # target_list are "stored"
+    gtps_2 = []
+    for target in target_list:
+        for gtp in gtps:
+            if target == gtp[1]:
+                gtps_2.append(gtp)
+    logger.info(gtps_2)
+
+    gp3 = GraphPattern([
+        (SOURCE_VAR, a, b),
+        (b, c, TARGET_VAR)
+    ])
+    gtp_scores = GTPScores(gtps)
+    gtp_scores2 = GTPScores(gtps_2)
+
+    # Fix the pattern over the found gtps
+    mfv2 = []
+    if len(gtps_2) > 1:
+        mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3)
+
+    # run the found patterns through mutate_fix_var once more
+    mfv = []
+    for gp_mfv2 in mfv2:
+        mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2)
+        for gp_res in mfv_res:
+            mfv.append(gp_res)
+
+    # evaluate the patterns found this way
+    res_eval = eval_gp_list(gtp_scores, mfv)
+    return res_eval
+
+
+# Runs a given query (passed as a string) against the SPARQL endpoint
+def run_query(q):
+    try:
+        q_short = ' '.join((line.strip() for line in q.split('\n')))
+        sparql.setQuery(q_short)
+        cal = time_func(sparql.queryAndConvert)
+    except socket.timeout:
+        cal = (timeout, {})
+    except ValueError:
+        # e.g.
if the endpoint gives us bad JSON for some unicode chars + logger.info( + 'Could not parse result for query, assuming empty result...\n' + 'Query:\n%s\nException:', q, + exc_info=1, # appends exception to message + ) + cal = (timeout, {}) + return cal + + +# Checks if an found RDF-Term can be used as value in a new query +# (without conflicts) +def list_remove_bool(var): + if isinstance(var, Literal): + i_n3 = var.n3() + if len(i_n3) > 60: + return True + elif isinstance(var, BNode): + return True + # echt hässlich, aber die einzige Möglichkeit, die ich gesehen habe um + # keine Probleme mit dem Category:Cigarettes-Beispiel zu bekommen + # (siehe docs) + # TODO: Möglicherweise dafür sorgen, dass die nicht rausgeschmissen, + # sondern nur nicht mit prefix gekürzt werden + elif isinstance(var, URIRef): + return ':' in var[7:] + return False + + +# evaluates a given graph-pattern-list +def eval_gp_list(gtp_scores, gp_list): + for gp_l in gp_list: + res_ev = evaluate( + sparql, timeout, gtp_scores, gp_l, run=0, gen=0) + update_individuals([gp_l], [res_ev]) + #print_graph_pattern(gp_, print_matching_node_pairs=0) + return gp_list + + +if __name__ == '__main__': + res = [] + for i in range(20): + res_ts = test_steps(ground_truth_pairs_2) + for gp_ts in res_ts: + res.append(gp_ts) + + res = sorted(res, key=lambda gp_: -gp_.fitness.values.score) + for i in range(10): + print_graph_pattern(res[i]) From 0837c104e3161b3f076d66e3c7ce239a0a48fd52 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Thu, 30 Aug 2018 14:21:27 +0200 Subject: [PATCH 04/27] test finished, alg.not yet in learner --- graph_pattern.py | 326 ++- tests/SPARQL-query.py | 75 + tests/test_mutate_deep_narrow.py | 3442 ++++++++++++++++++++++++++++++ tests/test_sampling.py | 227 +- 4 files changed, 3992 insertions(+), 78 deletions(-) create mode 100644 tests/SPARQL-query.py create mode 100644 tests/test_mutate_deep_narrow.py diff --git a/graph_pattern.py b/graph_pattern.py index 635fec1..e1468ad 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -34,7 +34,6 @@ logger = logging.getLogger(__name__) - RANDOM_VAR_LEN = 5 # so in total we have 62**5=916132832 different random vars RANDOM_VAR_PREFIX = 'vr' SOURCE_VAR = Variable('source') @@ -241,10 +240,10 @@ def canonicalize(gp, shorten_varnames=True): cgp = GraphPattern(cbgp, mapping=mapping) if not ( - len(gp) == len(cbgp) == len(cgp) - and len(gp.nodes) == len(cgp.nodes) - and len(gp.edges) == len(cgp.edges) - and sorted(gp.identifier_counts().values()) == + len(gp) == len(cbgp) == len(cgp) + and len(gp.nodes) == len(cgp.nodes) + and len(gp.edges) == len(cgp.edges) + and sorted(gp.identifier_counts().values()) == sorted(cgp.identifier_counts().values()) ): # canonicalization should never change any of the features above, but it @@ -432,8 +431,8 @@ def exclude(self, identifiers): [(s, p, o) for s, p, o in self if p not in identifiers and - s not in identifiers and - o not in identifiers + s not in identifiers and + o not in identifiers ] ) @@ -448,7 +447,7 @@ def identifier_counts(self, exclude_vars=False, vars_only=False): :param vars_only: Only return counts for vars. :return: Counter of all identifiers in this graph pattern. 
""" - assert not(exclude_vars and vars_only) + assert not (exclude_vars and vars_only) ids = Counter([i for t in self for i in t]) if exclude_vars: for i in self.vars_in_graph: @@ -639,6 +638,7 @@ def to_sparql_select_query( def to_sparql_select_sample_query( self, values, + values_s_t=None, projection=None, limit=None, sample_var=None @@ -651,6 +651,7 @@ def to_sparql_select_sample_query( Args: values: a dict mapping a variable tuple to a list of binding tuples, e.g. {(v1, v2): [(uri1, uri2), (uri3, uri4), ...]} + values_s_t: TODO projection: which variables to select on, by default all vars. limit: integer to limit the result size sample_var: the variable to sample over @@ -661,24 +662,286 @@ def to_sparql_select_sample_query( if projection is None: projection = sorted([v for v in self.vars_in_graph]) - if sample_var is None: - sample_var = random.choice(projection) - logger.info(sample_var) + # if sample_var is None: + # sample_var = random.choice(projection) + # logger.info(sample_var) + + if sample_var: + projection.remove(sample_var) + + res = "SELECT %(samp)s %(proj)s WHERE {\n" \ + "%(valst)s\n" \ + "%(qpp)s}\n" \ + "%(lim)s" % { + 'samp': (' SAMPLE(%s) as %s' % ( + ''.join(sample_var.n3()), + ''.join(sample_var.n3()) + )) if sample_var else '', + 'proj': ' '.join([v.n3() for v in projection]), + 'valst': self._sparql_values_part(values=values_s_t, indent=' ') + if values_s_t is not None else '', + 'qpp': self._sparql_query_pattern_part( + values=values, + indent=' ', + ), + 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) - projection.remove(sample_var) + def to_sparql_filter_by_count_in_out_query( + self, + values, + count_node, + in_out=None, + max_in=None, + max_out=None, + projection=None, + gp=None, + limit=None, + sample_var=None + ): + # TODO: Möglicherweise noch die Pfade aus dem gp_in rausfiltern, man + # will ja eher selten einen zusatzhop über einen schon vorhandenen + # Pfad finden - res = "SELECT %(samp)s %(proj)s WHERE {\n%(qpp)s}\n%(lim)s" % { - 'samp': (' SAMPLE(%s) as %s' % ( - ''.join(sample_var.n3()), - ''.join(sample_var.n3()) - )), - 'proj': ' '.join([v.n3() for v in projection]), - 'qpp': self._sparql_query_pattern_part( - values=values, - indent=' ', - ), - 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', - } + """Generates a SPARQL select query from the graph pattern. + + Examples: + TODO + + Args: TODO + values: a dict mapping a variable tuple to a list of binding tuples, + e.g. {(v1, v2): [(uri1, uri2), (uri3, uri4), ...]} + count_node: Node to filter over outgoing arcs. + in_out: + max_in: + max_out: max outgoing arcs + projection: which variables to select on, by default all vars. 
+ gp: + limit: integer to limit the result size + sample_var: the variable to sample over + """ + assert self.vars_in_graph, \ + "tried to get sparql for pattern without vars: %s" % (self,) + + if projection is None: + projection = sorted([v for v in self.vars_in_graph]) + if sample_var: + projection.remove(sample_var) + + if max_out is None: + max_out = 20 + if max_in is None: + max_in = 20 + + if in_out not in ['in', 'out', 'inout']: + in_out = random.choice(['in', 'out', 'inout']) + logger.info('in_out was set on %s' % in_out) + count_out = Variable('cout') + count_in = Variable('cin') + rand_var_out = gen_random_var() + rand_var_in = gen_random_var() + if gp: + if in_out == 'out': + gp_ = GraphPattern(chain(self, + GraphPattern([ + (count_node, count_out, rand_var_out) + ]), + gp)) + elif in_out == 'in': + gp_ = GraphPattern(chain(self, + GraphPattern([ + (rand_var_in, count_in, count_node) + ]), + gp)) + else: # TODO: Testen ob inout überhaupt passt + gp_ = GraphPattern(chain(self, + GraphPattern([ + (rand_var_in, count_in, count_node), + (count_node, count_out, rand_var_out) + ]), + gp)) + else: + if in_out == 'out': + gp_ = GraphPattern(chain(self, + GraphPattern([ + (count_node, count_out, rand_var_out) + ]) + )) + elif in_out == 'in': + gp_ = GraphPattern(chain(self, + GraphPattern([ + (rand_var_in, count_in, count_node) + ]) + )) + else: # TODO: Testen ob inout überhaupt passt + gp_ = GraphPattern(chain(self, + GraphPattern([ + (rand_var_in, count_in, count_node), + (count_node, count_out, rand_var_out) + ]) + )) + + res = "SELECT %(samp)s %(proj)s %(count)s WHERE " \ + "{\n%(qpp)s}\n%(gb)s\n%(hv)s\n%(lim)s" % { + 'samp': (' SAMPLE(%s) as %s' % ( + ''.join(sample_var.n3()), + ''.join(sample_var.n3()) + )) if sample_var else '', + 'proj': ' '.join([v.n3() for v in projection]), + 'count': (' COUNT(%s) as %s' % ( + ''.join(count_out.n3()), + ''.join(count_out.n3()))) if in_out == 'out' else + (' COUNT(%s) as %s' % ( + ''.join(count_in.n3()), + ''.join(count_in.n3()))) if in_out == 'in' else + (' COUNT(%s) as %s COUNT(%s) as %s' % ( + ''.join(count_out.n3()), + ''.join(count_out.n3()), + ''.join(count_in.n3()), + ''.join(count_in.n3()) + )), + 'qpp': gp_._sparql_query_pattern_part( + values=values, + indent=' ', + ), + 'gb': ('GROUP BY ' + ' '.join([v.n3() for v in projection])), + 'hv': ('HAVING (COUNT(%s)<%s)' % ( + ''.join(count_out.n3()), + str(max_out))) if in_out == 'out' else + ('HAVING (COUNT(%s)<%s)' % ( + ''.join(count_in.n3()), + str(max_in))) if in_out == 'in' else + ('HAVING (COUNT(%s)<%s&&COUNT(%s)<%s)' % ( + ''.join(count_out.n3()), + str(max_out), + ''.join(count_in.n3()), + str(max_in) + )), + 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', + } + res = textwrap.dedent(res) + return gp_._sparql_prefix(res) + + def to_sparql_useful_path_query( + self, + var_to_fix, + var_to_count, + valueblocks, + steps, + startvar=None, + avglimit=10, + gp_in=False + ): + count_var_to_count = Variable('c' + ''.join(var_to_count)) + avg_var_to_count = Variable('avgc' + ''.join(var_to_count)) + if startvar is None: + startvar = SOURCE_VAR + res = "SELECT %(vtf)s (AVG(%(cvtc)s) as %(avtc)s) {\n" \ + "SELECT %(stv)s %(vtf)s (COUNT (%(vtc)s) as %(cvtc)s) {\n" \ + "%(val)s\n" \ + "%(trip)s }\n" \ + "GROUP BY %(stv)s %(vtf)s }\n" \ + "GROUP BY %(vtf)s\n" \ + "HAVING (AVG (%(cvtc)s) < %(avgl)s)" % { + 'vtf': ''.join(var_to_fix.n3()), + 'cvtc': ''.join(count_var_to_count.n3()), + 'avtc': ''.join(avg_var_to_count.n3()), + 'stv': ''.join(startvar.n3()), + 'vtc': 
''.join(var_to_count.n3()), + 'val': ''.join([ + self._sparql_values_part( + values=valueblocks[key], indent=' ' + ) for key in valueblocks + ]), + 'trip': ''.join([ + step._sparql_triples_part(indent=' ') for step in steps + # TODO: nicht auf private Methode zugreifen + ]) + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + 'avgl': str(avglimit), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + + def to_sparql_inst_query( + self, + hop, + valueblocks, + gp_help, + gp_in=False + ): + res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \ + "%(val)s\n" \ + "%(trip)s }\n" \ + "GROUP BY %(vtf)s\n" \ + "HAVING (COUNT (?source) > 0)" % { + 'vtf': ' '.join([var.n3() for var in hop]), + 'val': ''.join([ + self._sparql_values_part( + values=valueblocks[key], indent=' ' + ) for key in valueblocks + ]), + 'trip': ''.join(gp_help._sparql_triples_part()) + + # TODO: nicht auf private Methode zugreifen + ''.join([ + self._sparql_triples_part( + indent=' ' + ) if gp_in else '' + ]), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + + # TODO: die normale inst durch diese hier ersetzen (sollte überall gehen) + def to_sparql_useful_path_inst_query( + self, + hop, + valueblocks, + steps, + gp_in=False + ): + res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \ + "%(val)s\n" \ + "%(trip)s }\n" \ + "GROUP BY %(vtf)s\n" \ + "HAVING (COUNT (?source) > 0)" % { + 'vtf': ' '.join([var.n3() for var in hop]), + 'val': ''.join([ + self._sparql_values_part( + values=valueblocks[key], indent=' ' + ) for key in valueblocks + ]), + 'trip': ''.join([ + step._sparql_triples_part() for step in steps + # TODO: nicht auf private Methode zugreifen + ]) + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + + def to_sparql_precheck_query( + self, + values, + gp_in=False + ): + res = "SELECT * {\n" \ + "%(val)s\n" \ + "%(trip)s\n" \ + "}\n" \ + "LIMIT 1" % { + 'val': ''.join( + self._sparql_values_part(values=values, indent=' ') + ), + 'trip': ''.join(self._sparql_triples_part(indent=' ')) + + ''.join([ + self._sparql_triples_part(indent=' ') if gp_in else '' + ]), + } res = textwrap.dedent(res) return self._sparql_prefix(res) @@ -702,9 +965,9 @@ def _sparql_query_pattern_part( ): assert bind is None or isinstance(bind, dict) assert values is None or ( - isinstance(values, dict) and - isinstance(next(six.iterkeys(values)), Iterable) and - isinstance(next(six.itervalues(values)), Iterable) + isinstance(values, dict) and + isinstance(next(six.iterkeys(values)), Iterable) and + isinstance(next(six.itervalues(values)), Iterable) ) res = '' @@ -1088,7 +1351,6 @@ def rate_graph_pattern(self, gp): ] return res - def prune_counts(self, below=2): lns = len(self.identifier_gt_node_sum) ln = len(self.identifier_gt_node_count) @@ -1115,7 +1377,7 @@ def prune_counts(self, below=2): def __str__(self): return '%s: pairs: %d, nodes: %d, Identifier counts:\n' \ - 'Pairs: %s\nNodes: %s' % ( - self.__class__.__name__, len(self.gt_pairs), len(self.nodes), - self.identifier_gt_pair_count, self.identifier_gt_node_count - ) + 'Pairs: %s\nNodes: %s' % ( + self.__class__.__name__, len(self.gt_pairs), len(self.nodes), + self.identifier_gt_pair_count, self.identifier_gt_node_count + ) diff --git a/tests/SPARQL-query.py b/tests/SPARQL-query.py new file mode 100644 index 0000000..4bbb7e0 --- /dev/null +++ b/tests/SPARQL-query.py @@ -0,0 +1,75 @@ +# coding=utf-8 +from __future__ import absolute_import 
+from __future__ import division
+from __future__ import print_function
+
+"""A file simply for firing off SPARQL queries, instead of doing it
+online in the browser.
+"""
+
+import logging
+from collections import OrderedDict
+from os import getenv
+
+import SPARQLWrapper
+from splendid import time_func
+import socket
+import rdflib
+from rdflib import URIRef
+from rdflib import Variable
+
+from config import SPARQL_ENDPOINT
+from gp_learner import evaluate
+from gp_learner import mutate_fix_var
+from gp_learner import update_individuals
+from gp_query import calibrate_query_timeout
+from gp_query import query_time_hard_exceeded
+from gp_query import query_time_soft_exceeded
+from graph_pattern import GraphPattern
+from graph_pattern import SOURCE_VAR
+from graph_pattern import TARGET_VAR
+from ground_truth_tools import get_semantic_associations
+from ground_truth_tools import split_training_test_set
+from gtp_scores import GTPScores
+from serialization import print_graph_pattern
+
+
+sparql = SPARQLWrapper.SPARQLWrapper(
+    getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql'))
+try:
+    timeout = max(5, calibrate_query_timeout(sparql))  # 5s for warmup
+except IOError:
+    from nose import SkipTest
+    raise SkipTest(
+        "Can't establish connection to SPARQL_ENDPOINT:\n    %s\n"
+        "Skipping tests in\n    %s" % (SPARQL_ENDPOINT, __file__))
+
+sparql.resetQuery()
+sparql.setTimeout(timeout)
+sparql.setReturnFormat(SPARQLWrapper.JSON)
+
+q = 'SELECT ?source ?target ?vcb0 ?vcb1 ?vcb2 ?vcb3 WHERE {' \
+    '?source ?vcb0 ?vcb2 .' \
+    '?target ?vcb3 .' \
+    '?target ?vcb1 .' \
+    '?vcb2 ?source .' \
+    '?vcb2 ?target ' \
+    '}'
+
+try:
+    q_short = ' '.join((line.strip() for line in q.split('\n')))
+    sparql.setQuery(q_short)
+    c = time_func(sparql.queryAndConvert)
+except socket.timeout:
+    c = (timeout, {})
+except ValueError:
+    # e.g. if the endpoint gives us bad JSON for some unicode chars
+    print(
+        'Could not parse result for query, assuming empty result...\n'
+        'Query:\n%s' % q
+    )
+    c = (timeout, {})
+
+t, res = c
+print('orig query took %.4f s, result:\n%s\n' % (t, res))
\ No newline at end of file
diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py
new file mode 100644
index 0000000..bbcbdca
--- /dev/null
+++ b/tests/test_mutate_deep_narrow.py
@@ -0,0 +1,3442 @@
+# coding=utf-8
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""Tests the different versions of mutate_deep_narrow.
+"""
+
+import logging
+import numpy as np
+import pickle
+import random
+from collections import defaultdict
+from collections import OrderedDict
+from os import getenv
+
+import SPARQLWrapper
+from itertools import chain
+from splendid import get_path
+from splendid import time_func
+import socket
+import rdflib
+from rdflib import BNode
+from rdflib import Literal
+from rdflib import URIRef
+from rdflib import Variable
+
+from config import SPARQL_ENDPOINT
+from gp_learner import evaluate
+from gp_learner import mutate_fix_var
+from gp_learner import update_individuals
+from gp_query import calibrate_query_timeout
+from gp_query import query_time_hard_exceeded
+from gp_query import query_time_soft_exceeded
+from graph_pattern import gen_random_var
+from graph_pattern import GraphPattern
+from graph_pattern import SOURCE_VAR
+from graph_pattern import TARGET_VAR
+from ground_truth_tools import get_semantic_associations
+from ground_truth_tools import split_training_test_set
+from gtp_scores import GTPScores
+from serialization import print_graph_pattern
+from utils import sparql_json_result_bindings_to_rdflib
+
+logger = logging.getLogger(__name__)
+
+sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT)
+# sparql = SPARQLWrapper.SPARQLWrapper(
+#     getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql'))
+try:
+    timeout = max(5, calibrate_query_timeout(sparql))  # 5s for warmup
+except IOError:
+    from nose import SkipTest
+    raise SkipTest(
+        "Can't establish connection to SPARQL_ENDPOINT:\n    %s\n"
+        "Skipping tests in\n    %s" % (sparql.endpoint, __file__))
+
+dbr = rdflib.Namespace('http://dbpedia.org/resource/')
+owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#')
+dbo = rdflib.Namespace('http://dbpedia.org/ontology/')
+# NB: the trailing slash is needed so that gold['hypernym'] yields
+# http://purl.org/linguistics/gold/hypernym, as used elsewhere
+gold = rdflib.Namespace('http://purl.org/linguistics/gold/')
+dbt = rdflib.Namespace('http://dbpedia.org/resource/Template:')
+dbp = rdflib.Namespace('http://dbpedia.org/property/')
+
+v = [gen_random_var() for i in range(100)]
+
+sameAs = owl['sameAs']
+pwl = dbo['wikiPageWikiLink']
+hypernym = gold['hypernym']
+wpUseTemp = dbp['wikiPageUsesTemplate']
+
+gp_found = {}
+gp_found['1'] = GraphPattern([
+    (SOURCE_VAR, pwl, TARGET_VAR),
+    (SOURCE_VAR, v[0], v[1]),
+    (v[1], hypernym, TARGET_VAR)
+])
+gp_found['2'] = GraphPattern([
+    (SOURCE_VAR, pwl, TARGET_VAR),
+    (TARGET_VAR, v[0], SOURCE_VAR),
+    (TARGET_VAR, v[1], URIRef('http://dbpedia.org/dbtax/Page'))
+])
+gp_found['3'] = GraphPattern([
+    (SOURCE_VAR, pwl, TARGET_VAR),
+    (TARGET_VAR, v[0], SOURCE_VAR),
+    (TARGET_VAR, v[1], dbt['Sister_project_links'])
+])
+gp_found['4'] = GraphPattern([
+    (SOURCE_VAR, pwl, TARGET_VAR),
+    (TARGET_VAR, wpUseTemp, dbt['Pp-semi-indef'])
+])
+gp_found['5'] = GraphPattern([
+    (SOURCE_VAR, pwl, TARGET_VAR),
+    (TARGET_VAR, v[0], dbt['Pp-semi-indef'])
+])
+gp_found['6'] = GraphPattern([
(SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Cite_book']) +]) +gp_found['7'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Redirect']) +]) +gp_found['8'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR) +]) +gp_found['50'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Use_dmy_dates']) +]) +gp_found['51'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Refend']) +]) +gp_found['52'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + URIRef('http://dbpedia.org/dbtax/Page')) +]) +gp_found['54'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (v[0], sameAs, SOURCE_VAR) +]) +gp_found['55'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (TARGET_VAR, pwl, SOURCE_VAR) +]) +gp_found['67'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Portal']) +]) +gp_found['68'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (TARGET_VAR, v[0], SOURCE_VAR), + (TARGET_VAR, v[1], dbt['Convert']) +]) +gp_found['69'] = GraphPattern([ + (SOURCE_VAR, hypernym, TARGET_VAR), + (v[0], hypernym, SOURCE_VAR) +]) +gp_found['72'] = GraphPattern([ + (SOURCE_VAR, URIRef('http://purl.org/dc/terms/subject'), v[1]), + (TARGET_VAR, pwl, SOURCE_VAR), + (v[0], sameAs, v[1]), + (v[1], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR) +]) +gp_found['94'] = GraphPattern([ + (SOURCE_VAR, URIRef('http://purl.org/dc/terms/subject'), v[1]), + (TARGET_VAR, v[0], SOURCE_VAR), + (v[1], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR) +]) +gp_found['131'] = GraphPattern([ + (SOURCE_VAR, v[0], v[2]), + (TARGET_VAR, pwl, v[1]), + (v[2], URIRef('http://www.w3.org/2004/02/skos/core#subject'), TARGET_VAR), +]) +gp_found['140'] = GraphPattern([ + (TARGET_VAR, pwl, SOURCE_VAR), + (TARGET_VAR, wpUseTemp, dbt['Other_uses']), + (TARGET_VAR, wpUseTemp, dbt['Pp-move-indef']), + (v[0], URIRef('http://www.w3.org/2000/01/rdf-schema#seeAlso'), TARGET_VAR), +]) +# Bis hier jedes mit neuem Fingerprint, jetzt noch 3 vom Rest +gp_found['231'] = GraphPattern([ + (SOURCE_VAR, dbo['class'], TARGET_VAR), + (TARGET_VAR, dbp['subdivisionRanks'], v[0]) +]) +gp_found['323'] = GraphPattern([ + (SOURCE_VAR, pwl, TARGET_VAR), + (v[0], dbp['species'], TARGET_VAR), + (v[1], dbo['wikiPageDisambiguates'], TARGET_VAR) +]) +gp_found['516'] = GraphPattern([ + (SOURCE_VAR, pwl, v[1]), + (TARGET_VAR, dbp['image'], v[0]), + (v[1], hypernym, TARGET_VAR), + (v[2], dbo['wikiPageRedirects'], SOURCE_VAR) +]) + +# Verschiedene Limits festlegen: +# Limit: search object-list => subject-values in next query +limit_next = 500 +# limt: search an object list from two diferrent subjects and get hits through +# comparing them +limit_endpoint_two_sided = 1000 +# limit: search object-list => compare with sources/targets from gtp +limit_choose_endpoint = 5000 +# limit: search subject-list from two diferrent objects and get hits through +# comparing them +limit_startpoint_two_sided = 200 +# limit: search subject-list => subject-values in next query +limit_subject_next = 350 +# limit: search subject list => compare with sources/targets from gtp +limit_choose_subject_endpoint = 3000 +# limits: hit-list => on side subject, one side object: +limit_subj_to_obj = 350 
+limit_obj_to_subj = 1500 + + +# einen ein-hop-weg von source zu target zum pattern hinzufügen +# TODO Varianten (von gefundenen b aus Variante der zweiten query +# 1.(default) mit (b, c, d) Liste von d suchen und mit Target-Liste vergleichen +# 2. mit (b, c, target). VALUES(target) suchen => +# Ergebnisse direkt an existente Targets gebunden +# 3. mit (b, c, target).urspurngs_gp +def mutate_deep_narrow_one_hop_s_t_without_direction( + gp_, gtps, max_out=None, max_in=None, in_out=None +): + vars_ = gp_.vars_in_graph + if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): + logger.info('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + # Erstelle pattern für den ersten Schritt + a = Variable('a') + b = Variable('b') + c = Variable('c') + values_s_t = {(SOURCE_VAR, TARGET_VAR): gtps} + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, gp=gp_, limit=200) + logger.info(q) + t, q_res1 = run_query(q) + if not q_res1['results']['bindings']: + return [] + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) + # Erstelle values aus den Ergebnissen für b + values = get_values([b], q_res1) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + try: + t, q_res2 = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind + target_list = get_values_list(TARGET_VAR, q_res2) + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + # Kreiere gtps_hit in der alle gtps, deren targets in target_list enthalten + # sind, "gespeichert" werden + stp_hit = get_stp_hit(target_list, gtps, 1) + gp_list = get_fixed_path_gp_one_hop( + q_res1, q_res2, gp_, stp_hit, [], a, b, c + ) + return gp_list + + +# einen ein-hop-weg von source zu target zum pattern hinzufügen +# (gp in query 2 eingefügt) +def mutate_deep_narrow_one_hop_s_t_2(gp_, gtps, max_in_out=None, in_out=None): + vars_ = gp_.vars_in_graph + if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): + logger.info('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + # Erstelle pattern für den ersten Schritt + a = Variable('a') + b = Variable('b') + c = Variable('c') + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + values_s_t = {(SOURCE_VAR, TARGET_VAR): gtps} + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s_t, count_node=b, in_out=in_out, + max_out=max_in_out, gp=gp_, limit=200) + logger.info(q) + t, q_res1 = run_query(q) + if not q_res1['results']['bindings']: + return [] + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # Erstelle values aus den Ergebnissen für b + values = get_values([b], q_res1) + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query( + values=values, values_s_t=values_s_t, limit=5000 + ) + logger.info(q) + try: + t, q_res2 = run_query(q) + except: + logger.info('Die Query (s.o.) 
hat nicht geklappt') + return [] + # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind + target_list = get_values_list(TARGET_VAR, q_res2) + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + # Kreiere gtps_hit in der alle gtps, deren targets in target_list enthalten + # sind, "gespeichert" werden + stp_hit = get_stp_hit(target_list, gtps, 1) + gp_list = get_fixed_path_gp_one_hop(q_res1, q_res2, gp_, stp_hit, a, b, c) + return gp_list + + +# eine one-hop verbindung zwischen source und target finden (Richtungen random) +def mutate_deep_narrow_one_random_hop_s_t(): + ich_darf_nich_leer_sein = [] + return ich_darf_nich_leer_sein + + +# einen direkten weg um einen hop erweitern (Weg löschen und stattdessen +# ein-hop weg einfügen) + + +# zu einem direkten weg noch einen ein-hop weg hinzufügen (weg behalten, +# ein-hop weg dazu) + + +# Runs a given (as String) query against the Sparql-endpoint +def run_query(q): + try: + q_short = ' '.join((line.strip() for line in q.split('\n'))) + sparql.setQuery(q_short) + cal = time_func(sparql.queryAndConvert) + except socket.timeout: + cal = (timeout, {}) + except ValueError: + # e.g. if the endpoint gives us bad JSON for some unicode chars + logger.info( + 'Could not parse result for query, assuming empty result...\n' + 'Query:\n%s\nException:', q, + exc_info=1, # appends exception to message + ) + cal = (timeout, {}) + return cal + + +# returns a list of value-tupels for the given variables, out of an +# query-result +def get_values(varlist, q_res): + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + vallist = [] + for row in bind: + tup = () + for var in varlist: + tup = tup + (get_path(row, [var]), ) + vallist.append(tup) + # ausfiltern von vallist (leider notwendig vor allem wegen dbr:Template + vallist[:] = [valtup for valtup in vallist if not list_remove_bool(valtup)] + # dopppelte noch herausfiltern + vallist = list(set(vallist)) + vartup = () + for var in varlist: + vartup = vartup + (var, ) + values = {vartup: vallist} + return values + + +# returns a list of found values for a given variable and query-result +def get_values_list(var, q_res): + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + vallist = [get_path(row, [var]) for row in bind] + return vallist + + +# gibt ein sample nach der Gewichtung der counts zurück, +# Gewichtung ist hier innerhalb angesetzt +def get_weighted_sample(var, count, q_res): + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + val = [] + weight = [] + for row in bind: + val.append(get_path(row, [var])) + # Davon ausgehend, dass x besonders gut ist + if float(get_path(row, [count])) == 1.0: + weight.append(10000) + else: + weight.append(1/(abs(1-float(get_path(row, [count]))))) + # Davon ausgehend, dass x besonders schlecht ist + # weight.append(abs(7-float(get_path(row, [count])))) + # weight.append(get_path(row, [count])) + s = sum(weight) + for i in range(len(weight)): + weight[i] = weight[i] / s + cum_weights = [0] + list(np.cumsum(weight)) + res = [] + while len(res) < min(10, len(list(set(val)))): + x = np.random.random() + i = 0 + while x > cum_weights[i]: + i = i + 1 + index = i - 1 + if val[index] not in res: + res.append((val[index],)) + sample = {(var,): res} + return sample + + +# gibt zu einer gegebenen Liste 
von Variablen die stp aus gtps zurück, +# bei denen Target(st=1)/Source(st=0) in der Variablen Liste ist. +def get_stp_hit(varlist, gtps, st): + stp = [] + for t in varlist: + for gtp in gtps: + if t == gtp[st]: + stp.append(gtp) + return stp + + +# Checks if an found RDF-Term can be used as value in a new query +# (without conflicts) +def list_remove_bool(tup): + for var in tup: + if isinstance(var, Literal): + i_n3 = var.n3() + if len(i_n3) > 60: + return True + elif isinstance(var, BNode): + return True + elif isinstance(var, URIRef): + return '%' in var + # TODO: nochmal schauen das % rauswerfen war kuzfristig, + # weil sparql mir bei einer query nen Fehler geschmissen hat + return False + + +# evaluates a given graph-pattern-list +def eval_gp_list(gtp_scores, gp_list): + for gp_l in gp_list: + eval_gp(gtp_scores, gp_l) + return gp_list + + +# evaluate a given graph-pattern +def eval_gp(gtp_scores, gp): + res = evaluate( + sparql, timeout, gtp_scores, gp, run=0, gen=0) + update_individuals([gp], [res]) + + +# helper to get target-hits and the corresponding stp +def target_hit(stps, t_lis): + res = [] + for stp in stps: + for t in t_lis: + if t == stp[1]: + res.append( + (t, stp) + ) + return res + + +# add one hop with the given direction. +def mutate_deep_narrow_one_hop( + gp_, max_out=None, max_in=None, in_out=None, richtung=None +): + vars_ = gp_.vars_in_graph + if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): + logger.info('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + if not gp_.matching_node_pairs: + logger.info( + 'No matching node pairs, cant get better through adding constraints' + ) + return [] + # Erstelle pattern für den ersten Schritt + a = Variable('a') + b = Variable('b') + c = Variable('c') + if richtung not in [1, 2, 3, 4]: + richtung = random.choice([1, 2, 3, 4]) + logger.info('Richtung %s wurde gewaehlt' % richtung) + if richtung == 1: + values_s_t = {(SOURCE_VAR, TARGET_VAR): gp_.matching_node_pairs} + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=200) + logger.info(q) + t, q_res1 = run_query(q) + if not q_res1: + return [] + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) + # Erstelle values aus den Ergebnissen für b + values = get_values([b], q_res1) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + try: + t, q_res2 = run_query(q) + except: + logger.info('Die Query (s.o.) 
hat nicht geklappt') + return [] + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + gp_list = get_fixed_path_gp_one_hop( + q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c + ) + elif richtung == 2: + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(TARGET_VAR, c, b)]) + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=1000) + logger.info(q) + t, q_res1 = run_query(q) + if not q_res1['results']['bindings']: + return [] + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=1000) + logger.info(q) + t, q_res2 = run_query(q) + if not q_res2['results']['bindings']: + return [] + gp_list = get_fixed_path_gp_one_hop( + q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c + ) + elif richtung == 3: + values_s_t = {(SOURCE_VAR, TARGET_VAR): gp_.matching_node_pairs} + gp2 = GraphPattern([(TARGET_VAR, c, b)]) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=200) + logger.info(q) + t, q_res2 = run_query(q) + if not q_res2['results']['bindings']: + return [] + # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + # Erstelle values aus den Ergebnissen für b + values = get_values([b], q_res2) + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp1.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + try: + t, q_res1 = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + gp_list = get_fixed_path_gp_one_hop( + q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c + ) + else: + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + gp2 = GraphPattern([(b, c, TARGET_VAR)]) + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=200) + logger.info(q) + t, q_res1 = run_query(q) + if not q_res1['results']['bindings']: + return [] + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=200) + logger.info(q) + t, q_res2 = run_query(q) + if not q_res2['results']['bindings']: + return [] + gp_list = get_fixed_path_gp_one_hop( + q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c + ) + return gp_list + + +# fixed den ein-hop-pfad zwischen Source und Target, fügt ihn dem Pattern hinzu +# und gibt die Liste der resultierenden Pattern zurück +# TODO nicht so sehr auf source a b. b c Target fokussieren. 
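+# (It binds the found one-hop path to concrete predicates and merges it into
+# gp_main. A sketch with hypothetical bindings: for richtung == 1, a result
+# row that binds a and c to predicates p1 and p2 for a matching
+# (source, target) pair adds
+#     ?source p1 ?b . ?b p2 ?target
+# to the pattern.)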
+def get_fixed_path_gp_one_hop(q_res1, q_res2, gp_main, richtung, stp, a, b, c):
+    gp_list = []
+    res_rows_path = ['results', 'bindings']
+    bind1 = sparql_json_result_bindings_to_rdflib(
+        get_path(q_res1, res_rows_path, default=[])
+    )
+    bind2 = sparql_json_result_bindings_to_rdflib(
+        get_path(q_res2, res_rows_path, default=[])
+    )
+    for row2 in bind2:
+        for gtp in stp:
+            if gtp[1] == get_path(row2, [TARGET_VAR]):
+                for row1 in bind1:
+                    if (get_path(row1, [b]) == get_path(row2, [b])) and \
+                            (get_path(row1, [SOURCE_VAR]) == gtp[0]):
+                        if richtung == 1:
+                            gp_ = GraphPattern([
+                                (SOURCE_VAR, get_path(row1, [a]), b),
+                                (b, get_path(row2, [c]), TARGET_VAR)
+                            ])
+                        elif richtung == 2:
+                            gp_ = GraphPattern([
+                                (SOURCE_VAR, get_path(row1, [a]), b),
+                                (TARGET_VAR, get_path(row2, [c]), b)
+                            ])
+                        elif richtung == 3:
+                            gp_ = GraphPattern([
+                                (b, get_path(row1, [a]), SOURCE_VAR),
+                                (TARGET_VAR, get_path(row2, [c]), b)
+                            ])
+                        else:
+                            gp_ = GraphPattern([
+                                (b, get_path(row1, [a]), SOURCE_VAR),
+                                (b, get_path(row2, [c]), TARGET_VAR)
+                            ])
+
+                        gp_ = GraphPattern(chain(gp_, gp_main))
+                        if gp_ not in gp_list:
+                            gp_list.append(gp_)
+                            logger.info(gtp)
+    return gp_list
+
+
+# Fixes the two-hop path between source and target, adds it to the pattern,
+# and returns the list of resulting patterns.
+# TODO: do not focus so much on the source-a-b . b-c-target shape.
+def get_fixed_path_gp_two_hops(
+        q_res1, q_res2, q_res3, gp_main, richtung, stp, a, b, c, d, e
+):
+    # TODO: consider not only building different patterns for the different
+    # directions, but also starting from the different results (one idea
+    # would be to replace a through e with numbered random vars and then
+    # work out how to pass them along, i.e. whether one can always make it
+    # along the pattern or rather not).
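+    # The eight richtung values enumerate the 2**3 possible orientations of
+    # the three hop triples (source-b, b-d, d-target), as built in the
+    # branches below.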
+ gp_list = [] + res_rows_path = ['results', 'bindings'] + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) + ) + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) + ) + bind3 = sparql_json_result_bindings_to_rdflib( + get_path(q_res3, res_rows_path, default=[]) + ) + for gtp in stp: + for row3 in bind3: + if gtp[1] == get_path(row3, [TARGET_VAR]): + for row2 in bind2: + if get_path(row2, [d]) == get_path(row3, [d]): + for row1 in bind1: + if get_path(row1, [b]) == \ + get_path(row2, [b]) and \ + get_path(row1, [SOURCE_VAR]) == \ + gtp[0]: + if richtung == 1: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), d), + (d, get_path(row3, [e]), TARGET_VAR) + ]) + elif richtung == 2: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), d), + (TARGET_VAR, get_path(row3, [e]), d) + ]) + elif richtung == 3: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (d, get_path(row2, [c]), b), + (d, get_path(row3, [e]), TARGET_VAR) + ]) + elif richtung == 4: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (d, get_path(row2, [c]), b), + (TARGET_VAR, get_path(row3, [e]), d) + ]) + elif richtung == 5: + gp_ = GraphPattern([ + (b, get_path(row1, [a]), SOURCE_VAR), + (b, get_path(row2, [c]), d), + (d, get_path(row3, [e]), TARGET_VAR) + ]) + elif richtung == 6: + gp_ = GraphPattern([ + (b, get_path(row1, [a]), SOURCE_VAR), + (b, get_path(row2, [c]), d), + (TARGET_VAR, get_path(row3, [e]), d) + ]) + elif richtung == 7: + gp_ = GraphPattern([ + (b, get_path(row1, [a]), SOURCE_VAR), + (d, get_path(row2, [c]), b), + (d, get_path(row3, [e]), TARGET_VAR) + ]) + else: + gp_ = GraphPattern([ + (b, get_path(row1, [a]), SOURCE_VAR), + (d, get_path(row2, [c]), b), + (TARGET_VAR, get_path(row3, [e]), d) + ]) + gp_ = GraphPattern(chain(gp_, gp_main)) + if gp_ not in gp_list: + gp_list.append(gp_) + logger.debug(gtp) + return gp_list + + +# add two hops. +def mutate_deep_narrow_two_hops( + gp_, max_out=None, max_in=None, in_out=None, richtung=None +): + vars_ = gp_.vars_in_graph + if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): + logger.debug('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + if not gp_.matching_node_pairs: + logger.debug( + 'No matching node pairs, cant get better through adding constraints' + ) + return [] + a = Variable('a') + b = Variable('b') + c = Variable('c') + d = Variable('d') + e = Variable('e') + gp_list = [] + if richtung not in range(1, 9): + richtung = random.choice(range(1, 9)) + logger.debug('Richtung %s wurde gewaehlt' % richtung) + if richtung == 1: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(d, e, TARGET_VAR)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res2) + q = gp3.to_sparql_select_sample_query( + values=values_d, limit=limit_choose_endpoint + ) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 2: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(TARGET_VAR, e, d)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 3: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(d, c, b)]) + gp3 = GraphPattern([(d, e, TARGET_VAR)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 4: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(d, c, b)]) + gp3 = GraphPattern([(TARGET_VAR, e, d)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res3) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 5: + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(d, e, TARGET_VAR)]) + + values_s = {(SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs]} + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_subject_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res2) + q = gp3.to_sparql_select_sample_query( + values=values_d, limit=limit_choose_endpoint + ) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 6: + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(TARGET_VAR, e, d)]) + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res3) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 7: + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + gp2 = GraphPattern([(d, c, b)]) + gp3 = GraphPattern([(d, e, TARGET_VAR)]) + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_subject_next) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res3) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res2) + q = gp1.to_sparql_select_sample_query( + values=values_b, limit=limit_choose_endpoint) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + if richtung == 8: + gp1 = GraphPattern([(b, a, SOURCE_VAR)]) + gp2 = GraphPattern([(d, c, b)]) + gp3 = GraphPattern([(TARGET_VAR, e, d)]) + + values_t = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res3) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res2) + q = gp1.to_sparql_select_sample_query( + values=values_b, limit=limit_choose_endpoint) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_two_hops( + q_res1, + q_res2, + q_res3, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e + ) + + return gp_list + + +# fixed den ein-hop-pfad zwischen Source und Target, fügt ihn dem Pattern hinzu +# und gibt die Liste der resultierenden Pattern zurück +# TODO nicht so sehr auf source a b. b c Target fokussieren. +def get_fixed_path_gp_three_hops( + q_res1, + q_res2, + q_res3, + q_res4, + gp_main, + richtung, + stp, + a, + b, + c, + d, + e, + f, + g +): + # TODO: überlegen nicht nur verschieden Pattern für verschiedene Richtungen + # zu machen, sondern auch in den Unterschiedlichen Ergebnissn anfangen + # (Idee wäre z.B. die a bis e durch nummerierte random vars zu ersetzen und + # sich dann zu überlegen wie man das übergibt, ob mans iwie immer entlang + # des patterns schafft oder eher nicht. 
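+ """Stitch the three-hop paths between source and target into the pattern.
+
+ richtung numbers the 16 possible orientation combinations of the four
+ triples, but only richtung 1 (all forward) and 2 (last triple reversed)
+ are stitched below; every other value currently falls through to the
+ empty dummy pattern.
+ """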
+ gp_list = [] + res_rows_path = ['results', 'bindings'] + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) + ) + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) + ) + bind3 = sparql_json_result_bindings_to_rdflib( + get_path(q_res3, res_rows_path, default=[]) + ) + bind4 = sparql_json_result_bindings_to_rdflib( + get_path(q_res4, res_rows_path, default=[]) + ) + for gtp in stp: + for row4 in bind4: + if gtp[1] == get_path(row4, [TARGET_VAR]): + for row3 in bind3: + if get_path(row3, [f]) == get_path(row4, [f]): + for row2 in bind2: + if get_path(row2, [d]) == get_path(row3, [d]): + for row1 in bind1: + if get_path(row1, [b]) == \ + get_path(row2, [b]) and \ + get_path(row1, [SOURCE_VAR]) == \ + gtp[0]: + if richtung == 1: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), d), + (d, get_path(row3, [e]), f), + (f, get_path(row4, [g]), TARGET_VAR) + ]) + elif richtung == 2: + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), d), + (d, get_path(row3, [e]), f), + (TARGET_VAR, get_path(row4, [g]), f) + ]) + else: # dummy else, damit gp_ zugewiesen + gp_ = GraphPattern([]) + gp_ = GraphPattern(chain(gp_, gp_main)) + if gp_ not in gp_list: + gp_list.append(gp_) + logger.debug(gtp) + return gp_list + + +# add two hops. +def mutate_deep_narrow_three_hops( + gp_, max_out=None, max_in=None, in_out=None, richtung=None +): + vars_ = gp_.vars_in_graph + if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): + logger.debug('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + if not gp_.matching_node_pairs: + logger.debug( + 'No matching node pairs, cant get better through adding constraints' + ) + return [] + a = Variable('a') + b = Variable('b') + c = Variable('c') + d = Variable('d') + e = Variable('e') + f = Variable('f') + g = Variable('g') + if richtung not in range(1, 17): + richtung = random.choice(range(1, 17)) + logger.debug('Richtung %s wurde gewaehlt' % richtung) + if richtung == 1: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(d, e, f)]) + gp4 = GraphPattern([(f, g, TARGET_VAR)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res2) + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=f, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_f = get_values([f], q_res3) + q = gp4.to_sparql_select_sample_query( + values=values_f, limit=limit_choose_endpoint + ) + logger.debug(q) + try: + t, q_res4 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res4: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res4['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_three_hops( + q_res1, + q_res2, + q_res3, + q_res4, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e, + f, + g + ) + elif richtung == 2: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(d, e, f)]) + gp4 = GraphPattern([(TARGET_VAR, g, f)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res2) + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=f, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_t = { + (TARGET_VAR,): [(tup[1],) for tup in gp_.matching_node_pairs] + } + q = gp4.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=f, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_endpoint_two_sided) + logger.debug(q) + try: + t, q_res4 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res4: + logger.debug('Die Query (s.o.) 
hat kein Ergebnis geliefert') + return [] + elif not q_res4['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_three_hops( + q_res1, + q_res2, + q_res3, + q_res4, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e, + f, + g + ) + elif richtung == 3: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(f, e, d)]) + gp4 = GraphPattern([(f, g, TARGET_VAR)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res1['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_b = get_values([b], q_res1) + q = gp2.to_sparql_filter_by_count_in_out_query( + values=values_b, count_node=d, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res2 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res2: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res2['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_d = get_values([d], q_res2) + q = gp3.to_sparql_filter_by_count_in_out_query( + values=values_d, count_node=f, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res3 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res3: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res3['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + values_t = { + (TARGET_VAR,): [(tup[1],) for tup in gp_.matching_node_pairs] + } + q = gp4.to_sparql_filter_by_count_in_out_query( + values=values_t, count_node=f, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_startpoint_two_sided) + logger.debug(q) + try: + t, q_res4 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res4: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not q_res4['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_three_hops( + q_res1, + q_res2, + q_res3, + q_res4, + gp_, + richtung, + gp_.matching_node_pairs, + a, + b, + c, + d, + e, + f, + g + ) + elif richtung == 4: + gp1 = GraphPattern([(SOURCE_VAR, a, b)]) + gp2 = GraphPattern([(b, c, d)]) + gp3 = GraphPattern([(f, e, d)]) + gp4 = GraphPattern([(TARGET_VAR, g, f)]) + + values_s = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + q = gp1.to_sparql_filter_by_count_in_out_query( + values=values_s, count_node=b, in_out=in_out, max_out=max_out, + max_in=max_in, limit=limit_next) + logger.debug(q) + try: + t, q_res1 = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not q_res1: + logger.debug('Die Query (s.o.) 
hat kein Ergebnis geliefert')
+ return []
+ elif not q_res1['results']['bindings']:
+ logger.debug('The query (see above) returned no bound variables')
+ return []
+
+ values_b = get_values([b], q_res1)
+ q = gp2.to_sparql_filter_by_count_in_out_query(
+ values=values_b, count_node=d, in_out=in_out, max_out=max_out,
+ max_in=max_in, limit=limit_endpoint_two_sided)
+ logger.debug(q)
+ try:
+ t, q_res2 = run_query(q)
+ except Exception:
+ logger.debug('The query (see above) failed')
+ return []
+ if not q_res2:
+ logger.debug('The query (see above) returned no result')
+ return []
+ elif not q_res2['results']['bindings']:
+ logger.debug('The query (see above) returned no bound variables')
+ return []
+
+ values_t = {
+ (TARGET_VAR,): [(tup[1],) for tup in gp_.matching_node_pairs]
+ }
+ q = gp4.to_sparql_filter_by_count_in_out_query(
+ values=values_t, count_node=f, in_out=in_out, max_out=max_out,
+ max_in=max_in, limit=limit_next)
+ logger.debug(q)
+ try:
+ t, q_res4 = run_query(q)
+ except Exception:
+ logger.debug('The query (see above) failed')
+ return []
+ if not q_res4:
+ logger.debug('The query (see above) returned no result')
+ return []
+ elif not q_res4['results']['bindings']:
+ logger.debug('The query (see above) returned no bound variables')
+ return []
+
+ values_f = get_values([f], q_res4)
+ q = gp3.to_sparql_filter_by_count_in_out_query(
+ values=values_f, count_node=d, in_out=in_out, max_out=max_out,
+ max_in=max_in, limit=limit_endpoint_two_sided)
+ logger.debug(q)
+ try:
+ t, q_res3 = run_query(q)
+ except Exception:
+ logger.debug('The query (see above) failed')
+ return []
+ if not q_res3:
+ logger.debug('The query (see above) returned no result')
+ return []
+ elif not q_res3['results']['bindings']:
+ logger.debug('The query (see above) returned no bound variables')
+ return []
+
+ gp_list = get_fixed_path_gp_three_hops(
+ q_res1,
+ q_res2,
+ q_res3,
+ q_res4,
+ gp_,
+ richtung,
+ gp_.matching_node_pairs,
+ a,
+ b,
+ c,
+ d,
+ e,
+ f,
+ g
+ )
+
+ return gp_list
+
+
+def get_fixed_path_gp_n_hops(
+ res_q, gp_, n, direct, stp, node, hn_ind, hop
+):
+ gp_list = []
+ res_rows_path = ['results', 'bindings']
+ bind = []
+ for res_q_i in res_q:
+ bind.append(sparql_json_result_bindings_to_rdflib(
+ get_path(res_q_i, res_rows_path, default=[]))
+ )
+ hit_paths = []
+ hit_paths_help = []
+
+ if hn_ind == 0:
+ # hit node is the source: start from the source hits and walk
+ # forward, extending every partial path whose last node matches
+ for row in bind[0]:
+ for mnp in stp:
+ if mnp[0] == (get_path(row, [node[0]])):
+ hit_paths.append([[
+ mnp[0],
+ get_path(row, [hop[0]]),
+ get_path(row, [node[1]])
+ ]])
+ for i in range(1, n+1):
+ for path in hit_paths:
+ for row in bind[i]:
+ if path[i-1][2] == get_path(row, [node[i]]):
+ path_h = path + [[
+ path[i-1][2],
+ get_path(row, [hop[i]]),
+ get_path(row, [node[i+1]])
+ ]]
+ hit_paths_help.append(path_h)
+ hit_paths = hit_paths_help
+ hit_paths_help = []
+
+ elif hn_ind == n+1:
+ # hit node is the target: start from the target hits and walk
+ # backwards towards the source, then reverse the collected paths
+ for row in bind[n]:
+ for mnp in stp:
+ if mnp[1] == (get_path(row, [node[n+1]])):
+ hit_paths.append([[
+ get_path(row, [node[n]]),
+ get_path(row, [hop[n]]),
+ mnp[1]
+ ]])
+ for i in range(n-1, -1, -1):
+ for path in hit_paths:
+ for row in bind[i]:
+ if path[(n-1)-i][0] == get_path(row, [node[i+1]]):
+ path_h = path + [[
+ get_path(row, [node[i]]),
+ get_path(row, [hop[i]]),
+ path[(n-1)-i][0]
+ ]]
+ hit_paths_help.append(path_h)
+ hit_paths = hit_paths_help
+ hit_paths_help = []
+ for path in hit_paths:
+ path.reverse()
+
+ else:
+ # hit node is in the middle: grow paths from both sides and join
+ hit_paths_l = []
+ hit_paths_r = []
+ # get the hits of hit_node to start from
+ for row_l in bind[hn_ind-1]:
+ for row_r in bind[hn_ind]:
+ if \
get_path(row_l, [node[hn_ind]]) == \ + get_path(row_r, [node[hn_ind]]): + hit_paths_l.append([[ + get_path(row_l, [node[hn_ind-1]]), + get_path(row_l, [hop[hn_ind-1]]), + get_path(row_l, [node[hn_ind]]) + ]]) + hit_paths_r.append([[ + get_path(row_r, [node[hn_ind]]), + get_path(row_r, [hop[hn_ind]]), + get_path(row_r, [node[hn_ind+1]]) + ]]) + # get the path from hit node to targets + for i in range(hn_ind+1, n+1): + for path in hit_paths_r: + for row in bind[i]: + if path[i-(hn_ind+1)][2] == get_path(row, [node[i]]): + path_h = path + [[ + path[i-(hn_ind+1)][2], + get_path(row, [hop[i]]), + get_path(row, [node[i+1]]) + ]] + hit_paths_help.append(path_h) + hit_paths_r = hit_paths_help + hit_paths_help = [] + # get the path from hit node to sources + for i in range(hn_ind, -1, -1): + for path in hit_paths_l: + for row in bind[i]: + if path[hn_ind-i][0] == get_path(row, [node[i+1]]): + path_h = path + [[ + get_path(row, [node[i]]), + get_path(row, [hop[i]]), + path[hn_ind-i][0] + ]] + hit_paths_help.append(path_h) + hit_paths_l = hit_paths_help + hit_paths_help = [] + # get the full path from source to target + for path_l in hit_paths_l: + path_l.reverse() + for path_r in hit_paths_r: + if path_l[hn_ind][2] == path_r[0][0]: + hit_paths.append(path_l + path_r) + # filter the paths, over stp-hits + + hit_paths = filter_stp_hits(hit_paths, stp) + + # Make Graph_Pattern_with fixed hops out of the found paths + for path in hit_paths: + gp_list.append( + GraphPattern( + chain( + GraphPattern([ + (node[i], path[i][1], node[i+1]) if direct(i) == 1 + else (node[i+1], path[i][1], node[i]) + for i in range(n+1) + ]), + gp_ + ) + ) + ) + + return gp_list + + +def filter_stp_hits( + hit_paths, stp +): + res = [] + for hit in hit_paths: + for mnp in stp: + if (mnp[0] == hit[0][0]) and (mnp[1] == hit[len(hit)-1][2]): + res.append(hit) + return res + + +def mutate_deep_narrow_n_hops( + gp_, n, max_out=None, max_in=None, in_out=None, direct=None +): + vars_ = gp_.vars_in_graph + if SOURCE_VAR not in vars_ and TARGET_VAR not in vars_: + logger.info('SOURCE or TARGET are not in gp: %s' % gp_) + return [] + if not gp_.matching_node_pairs: + logger.info( + 'No matching node pairs, cant get better through adding constraints' + ) + return [] + if n < 1: + logger.info('Cannot add less than one hop') + return [] + # setting up lists for nodes, hops, values, gp_helpers, query-results + node = [SOURCE_VAR] + for i in range(n): + node.append(gen_random_var()) + node.append(TARGET_VAR) + hop = [] + for i in range(n+1): + hop.append(gen_random_var()) + if direct is None or len(direct) != n+1: + logger.info('No direction chosen, or direction tuple with false length') + direct = [] + for i in range(n+1): + direct.append(0) + gp_helper = [] + for i in range(n+1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = [] + for i in range(n+2): + values.append({}) + values[0] = { + (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] + } + values[n+1] = { + (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] + } + res_q = [] + for i in range(n+1): + res_q.append({}) + + # selecting an random "hit_node" => Node to check the random hits + hit_node = random.choice(node) + hn_ind = node.index(hit_node) + + # TODO: use direct for cases in queriing + # Querieing + # From source to target if hit_node is target: + if hit_node 
== TARGET_VAR: + # Firing the queries for the first n-2 steps + for i in range(0, n): + if gp_helper[i][0][0] == node[i]: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i], count_node=node[i+1], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_next) + else: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i], count_node=node[i+1], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subject_next) + logger.info(q) + try: + t, res_q[i] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[i+1] = get_values([node[i+1]], res_q[i]) + # Firing the last query for the target hits: + if gp_helper[n][0][0] == node[n-1]: + q = gp_helper[n].to_sparql_select_sample_query( + values=values[n], limit=limit_choose_endpoint) + else: + q = gp_helper[n].to_sparql_select_sample_query( + values=values[n], limit=limit_choose_subject_endpoint) + logger.info(q) + try: + t, res_q[n] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[n]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[n]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + # From target to source if hit_node is source: + elif hit_node == SOURCE_VAR: + # Firing the queries for the first n-2 steps + for i in range(n, 0, -1): + if gp_helper[i][0][0] == node[i+1]: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i+1], count_node=node[i], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_next) + else: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i+1], count_node=node[i], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subject_next) + logger.info(q) + try: + t, res_q[i] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[i] = get_values([node[i]], res_q[i]) + # Firing the last query for the target hits: + if gp_helper[0][0][0] == node[1]: + q = gp_helper[0].to_sparql_select_sample_query( + values=values[1], limit=limit_choose_endpoint) + else: + q = gp_helper[0].to_sparql_select_sample_query( + values=values[1], limit=limit_choose_subject_endpoint) + logger.info(q) + try: + t, res_q[0] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[0]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[0]['results']['bindings']: + logger.info('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + + # From both sides to the hit_node: + else: + # firing the queries from source to the last node before hit_node + for i in range(0, hn_ind-1): + if gp_helper[i][0][0] == node[i]: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i], count_node=node[i+1], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_next) + else: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i], count_node=node[i+1], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subject_next) + logger.info(q) + try: + t, res_q[i] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[i+1] = get_values([node[i+1]], res_q[i]) + # Firing the queries from target to the last node before hit node + for i in range(n, hn_ind, -1): + if gp_helper[i][0][0] == node[i+1]: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i+1], count_node=node[i], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_next) + else: + q = gp_helper[i].to_sparql_filter_by_count_in_out_query( + values=values[i+1], count_node=node[i], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subject_next) + logger.info(q) + try: + t, res_q[i] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[i] = get_values([node[i]], res_q[i]) + # feuere die letzten beiden queries richtung hit_node ab. + # Dabei unterscheide nach Richtungen beider queries. + if ((gp_helper[hn_ind-1][0][0] == node[hn_ind-1]) and # hit is Object + (gp_helper[hn_ind][0][0] == node[hn_ind+1])): # hit is Object + q = gp_helper[hn_ind-1].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind-1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_endpoint_two_sided) + logger.info(q) + try: + t, res_q[hn_ind-1] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind-1]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind-1]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + q = gp_helper[hn_ind].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind+1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_endpoint_two_sided) + logger.info(q) + try: + t, res_q[hn_ind] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind]['results']['bindings']: + logger.info('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + elif ((gp_helper[hn_ind-1][0][0] == node[hn_ind]) and # hit is Subject + (gp_helper[hn_ind][0][0] == node[hn_ind])): # hit is Subject + q = gp_helper[hn_ind-1].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind-1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_startpoint_two_sided) + logger.info(q) + try: + t, res_q[hn_ind-1] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind-1]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind-1]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + q = gp_helper[hn_ind].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind+1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_startpoint_two_sided) + logger.info(q) + try: + t, res_q[hn_ind] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + elif ((gp_helper[hn_ind-1][0][0] == node[hn_ind-1]) and # hit is Object + (gp_helper[hn_ind][0][0] == node[hn_ind])): # hit is Subject + q = gp_helper[hn_ind-1].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind-1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_obj_to_subj) + logger.info(q) + try: + t, res_q[hn_ind-1] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind-1]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind-1]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + q = gp_helper[hn_ind].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind+1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subj_to_obj) + logger.info(q) + try: + t, res_q[hn_ind] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + elif ((gp_helper[hn_ind-1][0][0] == node[hn_ind]) and # hit is Subject + (gp_helper[hn_ind][0][0] == node[hn_ind+1])): # hit is Object + q = gp_helper[hn_ind-1].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind-1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_subj_to_obj) + logger.info(q) + try: + t, res_q[hn_ind-1] = run_query(q) + except: + logger.info('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[hn_ind-1]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind-1]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + q = gp_helper[hn_ind].to_sparql_filter_by_count_in_out_query( + values=values[hn_ind+1], count_node=node[hn_ind], in_out=in_out, + max_out=max_out, max_in=max_in, limit=limit_obj_to_subj) + logger.info(q) + try: + t, res_q[hn_ind] = run_query(q) + except: + logger.info('Die Query (s.o.) 
hat nicht geklappt') + return [] + if not res_q[hn_ind]: + logger.info('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[hn_ind]['results']['bindings']: + logger.info('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + + gp_list = get_fixed_path_gp_n_hops( + res_q, gp_, n, direct, gp_.matching_node_pairs, node, hn_ind, hop + ) + + return gp_list + + +# erste Version, komplett straight forward +def mutate_deep_narrow( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n + 1): + res_q.append({}) + + # Queries für die Schritte + valueblocks = {} + valueblocks[SOURCE_VAR] = values[SOURCE_VAR] + for i in range(n+1): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks[hop[i]] = { + (hop[i],): random.sample( + values[hop[i]][(hop[i],)], + min(10, len(values[hop[i]][(hop[i],)])) + ) + } + + # Query fürs Ergebnis + gp_help = GraphPattern([ + (node[i], hop[i], node[i+1]) if direct[i] == 1 + else (node[i+1], hop[i], node[i]) + for i in range(n+1) + ]) + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + del valueblocks[SOURCE_VAR] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# zweite Version: Query für letzten step bekommt schon die Targets +def mutate_deep_narrow_2( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n + 1): + res_q.append({}) + + # Queries für die Schritte + valueblocks = {} + valueblocks[SOURCE_VAR] = values[SOURCE_VAR] + for i in range(n): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks[hop[i]] = { + (hop[i],): random.sample( + values[hop[i]][(hop[i],)], + min(10, len(values[hop[i]][(hop[i],)])) + ) + } + + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + del valueblocks[SOURCE_VAR] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_useful_path_inst_query( + hop, valueblocks, gp_helper, gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# dritte Version: BIDI straight forward +def mutate_deep_narrow_3( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n+1): + res_q.append({}) + + # Queries für die Schritte + valueblocks_s = {} + valueblocks_s[SOURCE_VAR] = values[SOURCE_VAR] + valueblocks_t = {} + valueblocks_t[TARGET_VAR] = values[TARGET_VAR] + for i in range(int((n / 2) + 1)): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks_s[hop[i]] = { + (hop[i],): random.sample( + values[hop[i]][(hop[i],)], + min(10, len(values[hop[i]][(hop[i],)])) + ) + } + if n-i != i: + q = gp_.to_sparql_useful_path_query( + hop[n-i], + node[n-i], + valueblocks_t, + gp_helper[n-i:], + startvar=TARGET_VAR, + gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[n-i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[n-i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[n-i]['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + values[hop[n-i]] = get_values([hop[n-i]], res_q[n-i]) + valueblocks_t[hop[n-i]] = { + (hop[n-i],): random.sample( + values[hop[n-i]][(hop[n-i],)], + min(10, len(values[hop[n-i]][(hop[n-i],)])) + ) + } + + # Query fürs Ergebnis + gp_help = GraphPattern([ + (node[i], hop[i], node[i+1]) if direct[i] == 1 + else (node[i+1], hop[i], node[i]) + for i in range(n+1) + ]) + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + valueblocks = {} + for key in valueblocks_s: + if key is not SOURCE_VAR: + valueblocks[key] = valueblocks_s[key] + for key in valueblocks_t: + if key is not TARGET_VAR: + valueblocks[key] = valueblocks_t[key] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# vierte Version: BIDI with instantiation in last step +def mutate_deep_narrow_4( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n+1): + res_q.append({}) + + # Queries für die Schritte + valueblocks_s = {} + valueblocks_s[SOURCE_VAR] = values[SOURCE_VAR] + valueblocks_t = {} + valueblocks_t[TARGET_VAR] = values[TARGET_VAR] + for i in range(int((n / 2) + 1)): + if i < int(n/2): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) 
hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks_s[hop[i]] = { + (hop[i],): random.sample( + values[hop[i]][(hop[i],)], + min(10, len(values[hop[i]][(hop[i],)])) + ) + } + if n-i > i: + q = gp_.to_sparql_useful_path_query( + hop[n-i], + node[n-i], + valueblocks_t, + gp_helper[n-i:], + startvar=TARGET_VAR, + gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[n-i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[n-i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[n-i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[n-i]] = get_values([hop[n-i]], res_q[n-i]) + valueblocks_t[hop[n-i]] = { + (hop[n-i],): random.sample( + values[hop[n-i]][(hop[n-i],)], + min(10, len(values[hop[n-i]][(hop[n-i],)])) + ) + } + + # Query fürs Ergebnis + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + valueblocks = {} + for key in valueblocks_s: + if key is not SOURCE_VAR: + valueblocks[key] = valueblocks_s[key] + for key in valueblocks_t: + if key is not TARGET_VAR: + valueblocks[key] = valueblocks_t[key] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_useful_path_inst_query( + hop, valueblocks, gp_helper, gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# fünfte Version: filtern nach Count +def mutate_deep_narrow_5( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n + 1): + res_q.append({}) + + # Queries für die Schritte + valueblocks = {} + valueblocks[SOURCE_VAR] = values[SOURCE_VAR] + for i in range(n+1): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) 
hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks[hop[i]] = get_weighted_sample( + hop[i], Variable('avgc'+''.join(node[i+1])), res_q[i] + ) + + # Query fürs Ergebnis + gp_help = GraphPattern([ + (node[i], hop[i], node[i+1]) if direct[i] == 1 + else (node[i+1], hop[i], node[i]) + for i in range(n+1) + ]) + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + del valueblocks[SOURCE_VAR] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# sechste Version: Query für letzten step bekommt schon die Targets +# => Precheck feasible? +def mutate_deep_narrow_6( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n + 1): + res_q.append({}) + + # Pre-check: + gp_help = GraphPattern([ + (node[i], hop[i], node[i+1]) if direct[i] == 1 + else (node[i+1], hop[i], node[i]) + for i in range(n+1) + ]) + q = gp_help.to_sparql_precheck_query(values['st'], gp_in=gp_in) + logger.debug(q) + try: + t, res_q = run_query(q) + except: + logger.info('Pre-Check hat nicht geklappt') + if not res_q: + logger.info('Pre-Check hat kein Ergebnis') + elif not res_q['results']['bindings']: + logger.info('Pre-Check hat keine gebundenen Variablen') + else: + logger.info('Pre-Check hat einen Treffer') + + # Queries für die Schritte + valueblocks = {} + valueblocks[SOURCE_VAR] = values[SOURCE_VAR] + for i in range(n): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) 
hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks[hop[i]] = { + (hop[i],): random.sample( + values[hop[i]][(hop[i],)], + min(10, len(values[hop[i]][(hop[i],)])) + ) + } + + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + del valueblocks[SOURCE_VAR] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_useful_path_inst_query( + hop, valueblocks, gp_helper, gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +# siebte Version: BIDI with instantiation in last step + ws-sampling +def mutate_deep_narrow_7( + gp_, gtps, n, direct=None, gp_in=False +): + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [] + for i in range(n + 1): + hop.append(Variable('p%i' % i)) + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [] + for i in range(n + 1): + direct.append(0) + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + values = {} + values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} + values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} + values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} + res_q = [] + for i in range(n+1): + res_q.append({}) + + # Queries für die Schritte + valueblocks_s = {} + valueblocks_s[SOURCE_VAR] = values[SOURCE_VAR] + valueblocks_t = {} + valueblocks_t[TARGET_VAR] = values[TARGET_VAR] + for i in range(int((n / 2) + 1)): + if i < int(n/2): + q = gp_.to_sparql_useful_path_query( + hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[i]: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q[i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[i]] = get_values([hop[i]], res_q[i]) + valueblocks_s[hop[i]] = get_weighted_sample( + hop[i], Variable('avgc' + ''.join(node[i + 1])), res_q[i] + ) + if n-i > i: + q = gp_.to_sparql_useful_path_query( + hop[n-i], + node[n-i], + valueblocks_t, + gp_helper[n-i:], + startvar=TARGET_VAR, + gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q[n-i] = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q[n-i]: + logger.debug('Die Query (s.o.) 
hat kein Ergebnis geliefert') + return [] + elif not res_q[n-i]['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + values[hop[n-i]] = get_values([hop[n-i]], res_q[n-i]) + valueblocks_t[hop[n-i]] = get_weighted_sample( + hop[n-i], Variable('avgc' + ''.join(node[n-i])), res_q[n-i] + ) + + # Query fürs Ergebnis + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + valueblocks = {} + for key in valueblocks_s: + if key is not SOURCE_VAR: + valueblocks[key] = valueblocks_s[key] + for key in valueblocks_t: + if key is not TARGET_VAR: + valueblocks[key] = valueblocks_t[key] + valueblocks['st'] = values['st'] + q = gp_.to_sparql_useful_path_inst_query( + hop, valueblocks, gp_helper, gp_in=gp_in + ) + logger.debug(q) + try: + t, res_q_inst = run_query(q) + except: + logger.debug('Die Query (s.o.) hat nicht geklappt') + return [] + if not res_q_inst: + logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') + return [] + elif not res_q_inst['results']['bindings']: + logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') + return [] + res = [] + res_rows_path = ['results', 'bindings'] + bind = sparql_json_result_bindings_to_rdflib( + get_path(res_q_inst, res_rows_path, default=[]) + ) + for row in bind: + gp_res = GraphPattern([ + (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 + else (node[i + 1], get_path(row, [hop[i]]), node[i]) + for i in range(n + 1) + ]) + res.append(gp_res) + + return res + + +def main(): + ground_truth_pairs = get_semantic_associations() + ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # ground_truth_pairs = ground_truth_pairs[:100] + gtp_scores = GTPScores(ground_truth_pairs) + res = [] + # key = random.choice(gp_found.keys()) + # for i in range(100): + # # ground_truth_pairs = random.sample(ground_truth_pairs, 200) + # gp_ = GraphPattern([]) + # # gp_ = gp_found[key] + # res_= mutate_deep_narrow_5(gp_, ground_truth_pairs, 2, gp_in=False) + # res.append(res_) + # logger.info(i) + # if res_: + # logger.info(res_) + # + # logger.info(res) + for key in gp_found.keys(): + gp_ = gp_found[key] + eval_gp(gtp_scores, gp_) + for i in range(100): + res_ = mutate_deep_narrow_4( + gp_, gp_.matching_node_pairs, 6, gp_in=False + ) + res.append(res_) + logger.info((i, key)) + if res_: + logger.info(res_) + + # res_eval=[] + # res = [] + # + # max_out = 65 + # max_in = 40 + # in_out = 'out' + # richtung = 2 + # ground_truth_pairs = get_semantic_associations() + # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # # ground_truth_pairs = ground_truth_pairs[0:200] + # gtp_scores = GTPScores(ground_truth_pairs) + # gp = gp_found['140'] + # eval_gp(gtp_scores, gp) + # + # for i in range(20): + # res.append(mutate_deep_narrow_n_hops(gp, 2, max_out=max_out, in_out=in_out)) + # + # logger.info(res) + # + # durchgaenge = [] + # + # for richtung in range(1, 9): + # for max_out in [5, 10, 20, 30, 40, 50, 65, 75, 85, 100, 200]: + # for key in gp_found.keys(): + # durchgaenge.append((richtung, max_out, key)) + # + # random.shuffle(durchgaenge) + # + # for (richtung, max_out, key) in durchgaenge: + # logger.info('Durchgang: richtung = %s, max_out = %s, gp.key = %s' % + # (richtung, max_out, key) + # ) + # ground_truth_pairs = get_semantic_associations() + # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # # ground_truth_pairs = random.sample(ground_truth_pairs, 100) + # gtp_scores = GTPScores(ground_truth_pairs) + # gp = 
gp_found[key] + # eval_gp(gtp_scores, gp) + # + # res_gp = mutate_deep_narrow_two_hops( + # gp, + # max_out=max_out, + # max_in=max_in, + # in_out=in_out, + # richtung=richtung + # ) + # res_gp.append(gp) + # res_eval = eval_gp_list(gtp_scores, res_gp) + # gp_eval = res_eval[-1] + # res_eval = sorted( + # res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score + # ) + # if res_eval: + # logger.info(max_out) + # print_graph_pattern(gp) + # for gp_ in res_eval: + # print_graph_pattern(gp_) + # res.append((richtung, key, max_out, gp_eval, res_eval)) + + # f = open('store.pckl', 'wb') + # pickle.dump(res, f) + # f.close() + + # in der Konsole das res nochmal anschauen: + # import pickle + # f = open('tests/store.pckl', 'rb') + # res = pickle.load(f) + # f.close() + + # print('HERE STARTS THE RES_PRINTING:') + # for r in res: + # print('richtung %s, key %s, max_out %s\n' % r[0:3]) + # print('Original GP:\n') + # print_graph_pattern(r[3], print_matching_node_pairs=0) + # print('Top 3 found (if 3 where found, else all found) GP:\n') + # for i in range(min(3, len(r[4]))): + # print_graph_pattern(r[4][i], print_matching_node_pairs=0) + + # ground_truth_pairs = get_semantic_associations() + # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # ground_truth_pairs = random.sample(ground_truth_pairs, 100) + # gtp_scores = GTPScores(ground_truth_pairs) + # gp = gp_found[random.choice(gp_found.keys())] + # + # max_out = 50 + # max_in = 40 + # in_out = 'out' + # + # res = mutate_deep_narrow_one_hop_s_t_without_direction( + # gp, + # ground_truth_pairs, + # max_out=max_out, + # max_in=max_in, + # in_out=in_out + # ) + # res.append(gp) + # res_eval = eval_gp_list(gtp_scores, res) + # gp_eval = res_eval[-1] + # res_eval = sorted(res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score) + # + # print_graph_pattern(gp_eval) + # for gp_ in res_eval: + # print_graph_pattern(gp_) + + # # Zählfelder für die Statistik (Zugriff über max_in_out) + # # durchschnittliche Anzahl der zurückgegebenen pattern + # avg_num_pat = {} + # # maximal zurückgegebene pattern + # max_num_pat = {} + # # durchschnittlicher Score aller zurückgegebenen pattern + # avg_score_all_pat = {} + # # durchschnittlicher Score des besten zurückgegegebenen pattern + # # (wenn vorhanden) + # avg_score_best_pat = {} + # # druchschnittlicher Score des besten zurückgegebenen patterns + # # (0 wenn keins vorhanden) + # avg_score_best_pat_pun = {} + # # maximaler Score eines zurückgegebenen patterns + # max_score_ovrall = {} + # # Wie oft wurde kein pattern zurückgegeben + # num_no_pattern = {} + # # durchschnittliche abweichung des besten patterns vom Score des + # # Ausgangspatterns, wenn vorhanden + # avg_diff_all_pat = {} + # # durchschnittliche Abweichung vom Score des Ausgangspatterns, + # # wenn vorhanden + # avg_diff_best_pat = {} + # # aufaddierter score von Durchgängen ohne pattern + # punish_avg_diff_best_pat = {} + # # aufaddierter score von Durchgängen ohne pattern mal der durchschnittlichen + # # Anzahl zurückgegebener pattern + # punish_avg_diff_all_pat = {} + # # durchschnittliche Abweichung des besten patterns vom score des + # # Ausgangspatterns mit Strafe für gar kein pattern + # avg_diff_all_pat_punished = {} + # # durchschnittliche Abweichung vom Score des Ausgangspatterns, mit Strafe + # # für gar kein pattern + # avg_diff_best_pat_punished = {} + # # die fünf besten (am stärksten verbessernden) pattern + # five_best_pattern = {} + # + # max_out_steps = [10, 15, 20, 25, 30, 40, 50, 75, 100] + # + # for j in 
max_out_steps: + # avg_num_pat[j] = 0 + # max_num_pat[j] = 0 + # avg_score_all_pat[j] = 0 + # avg_score_best_pat[j] = 0 + # avg_score_best_pat_pun[j] = 0 + # max_score_ovrall[j] = 0 + # num_no_pattern[j] = 0 + # avg_diff_all_pat[j] = 0 + # avg_diff_best_pat[j] = 0 + # punish_avg_diff_best_pat[j] = 0 + # punish_avg_diff_all_pat[j] = 0 + # avg_diff_all_pat_punished[j] = 0 + # avg_diff_best_pat_punished[j] = 0 + # five_best_pattern[j] = [] + # + # reps = 50 + # + # for i in range(reps): + # ground_truth_pairs = get_semantic_associations() + # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs) + # ground_truth_pairs = random.sample(ground_truth_pairs, 100) + # gtp_scores = GTPScores(ground_truth_pairs) + # gp = gp_found[random.choice(gp_found.keys())] + # for j in max_out_steps: + # res = mutate_deep_narrow_one_hop_s_t_without_direction( + # gp, ground_truth_pairs, max_out=j, in_out='out' + # ) # TODO: warum kommt oben None rein??? + # res.append(gp) + # res_eval = eval_gp_list(gtp_scores, res) + # gp_eval = res_eval[-1] + # res_eval = sorted( + # res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score + # ) + # + # # Statistik: + # avg_num_pat[j] = avg_num_pat[j] + len(res_eval) / reps + # if len(res_eval) > max_num_pat[j]: + # max_num_pat[j] = len(res_eval) + # for gp_ in res_eval: + # avg_score_all_pat[j] = avg_score_all_pat[j] + \ + # gp_.fitness.values.score / \ + # (len(res_eval) * reps) + # if res_eval: + # avg_score_best_pat[j] = avg_score_best_pat[j] + \ + # res_eval[0].fitness.values.score + # if res_eval: + # if res_eval[0].fitness.values.score > max_score_ovrall[j]: + # max_score_ovrall[j] = res_eval[0].fitness.values.score + # if len(res_eval) == 0: + # num_no_pattern[j] = num_no_pattern[j] + 1 + # if res_eval: + # avg_diff_all_pat[j] = avg_diff_all_pat[j] + \ + # (res_eval[0].fitness.values.score - + # gp_eval.fitness.values.score) / \ + # reps + # for gp_ in res_eval: + # avg_diff_best_pat[j] = avg_diff_best_pat[j] + \ + # (gp_.fitness.values.score - + # gp_eval.fitness.values.score) / \ + # (len(res_eval) * reps) + # if not res_eval: + # punish_avg_diff_best_pat[j] = punish_avg_diff_best_pat[j] + \ + # gp_eval.fitness.values.score + # if res_eval: + # if len(five_best_pattern[j]) < 5: + # five_best_pattern[j].append(( + # res_eval[0].fitness.values.score - + # gp_eval.fitness.values.score, + # res_eval[0], + # gp_eval + # )) + # five_best_pattern[j] = sorted( + # five_best_pattern[j], + # key=lambda tup_: -tup_[0] + # ) + # else: + # five_best_pattern[j][4] = ( + # res_eval[0].fitness.values.score - + # gp_eval.fitness.values.score, + # res_eval[0], + # gp_eval + # ) + # five_best_pattern[j] = sorted( + # five_best_pattern[j], + # key=lambda tup_: -tup_[0] + # ) + # logger.info('Runde %s, min_max = %s' % (i, j)) + # print_graph_pattern(gp) + # if res_eval: + # print_graph_pattern(res_eval[0]) + # + # # print out the five best patterns per min_max: + # logger.info(' The five best new patterns (per min_max): ') + # for j in max_out_steps: + # for i in range(len(five_best_pattern[j])): + # print('min_max: %s\n' % j) + # print('Differenz: %s\n' % five_best_pattern[j][i][0]) + # print_graph_pattern(five_best_pattern[j][i][1]) + # print_graph_pattern(five_best_pattern[j][i][2]) + # + # # more statistics + # for j in max_out_steps: + # avg_score_best_pat_pun[j] = avg_score_best_pat[j] / reps + # if reps - num_no_pattern[j]: + # avg_score_best_pat[j] = avg_score_best_pat[j] / \ + # (reps - num_no_pattern[j]) + # else: + # avg_score_best_pat = -1 + # 
punish_avg_diff_all_pat[j] = punish_avg_diff_best_pat[j] * \
+    #                                     avg_num_pat[j]
+    #         avg_diff_all_pat_punished[j] = avg_diff_all_pat[j] - \
+    #                                        punish_avg_diff_best_pat[j]
+    #         avg_diff_best_pat_punished[j] = avg_diff_best_pat[j] - \
+    #                                         punish_avg_diff_all_pat[j]
+    #
+    #     # print the statistics
+    #     logger.info('min_max: %s\n'
+    #                 'avg_num_pat: %s\n'
+    #                 'max_num_pat: %s\n'
+    #                 'avg_score_all_pat: %s\n'
+    #                 'avg_score_best_pat: %s\n'
+    #                 'avg_score_best_pat_pun: %s\n'
+    #                 'max_score_ovrall: %s\n'
+    #                 'num_no_pattern: %s\n'
+    #                 'avg_diff_all_pat: %s\n'
+    #                 'avg_diff_best_pat: %s\n'
+    #                 'punish_avg_diff_best_pat: %s\n'
+    #                 'punish_avg_diff_all_pat: %s\n'
+    #                 'avg_diff_all_pat_punished: %s\n'
+    #                 'avg_diff_best_pat_punished: %s\n' % (
+    #                     ' '.join([str(x) for x in max_out_steps]),
+    #                     ' '.join([str(avg_num_pat[x]) for x in max_out_steps]),
+    #                     ' '.join([str(max_num_pat[x]) for x in max_out_steps]),
+    #                     ' '.join([str(avg_score_all_pat[x]) for x in max_out_steps]),
+    #                     ' '.join([str(avg_score_best_pat[x]) for x in max_out_steps]),
+    #                     ' '.join(
+    #                         [str(avg_score_best_pat_pun[x]) for x in max_out_steps]
+    #                     ),
+    #                     ' '.join([str(max_score_ovrall[x]) for x in max_out_steps]),
+    #                     ' '.join([str(num_no_pattern[x]) for x in max_out_steps]),
+    #                     ' '.join([str(avg_diff_all_pat[x]) for x in max_out_steps]),
+    #                     ' '.join([str(avg_diff_best_pat[x]) for x in max_out_steps]),
+    #                     ' '.join(
+    #                         [str(punish_avg_diff_best_pat[x]) for x in max_out_steps]
+    #                     ),
+    #                     ' '.join(
+    #                         [str(punish_avg_diff_all_pat[x]) for x in max_out_steps]
+    #                     ),
+    #                     ' '.join(
+    #                         [str(avg_diff_all_pat_punished[x]) for x in max_out_steps]
+    #                     ),
+    #                     ' '.join(
+    #                         [str(avg_diff_best_pat_punished[x]) for x in max_out_steps]
+    #                     )
+    #                 ))
+    #
+    #     # TODO: find the bug that makes the difference of the gp scores in
+    #     # five_best_patterns wrong
+    #
+    #     res = res[0:100]
+    #     for res_ in res:
+    #         # print('max_out:' + str(res_[1]))
+    #         print_graph_pattern(res_)
+    #
+    #     # TODO: also bind the second query to SOURCE and TARGET and include
+    #     # gp in the query; then searching for the hits can be skipped ?!
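+    # A minimal, hypothetical driver sketch for re-running the experiment
+    # above by hand (only names already used in this module: gp_found,
+    # eval_gp, print_graph_pattern, mutate_deep_narrow_4):
+    #
+    #   gtps = get_semantic_associations()
+    #   gtps, _ = split_training_test_set(gtps)
+    #   gtp_scores = GTPScores(gtps)
+    #   gp_ = gp_found[random.choice(gp_found.keys())]
+    #   eval_gp(gtp_scores, gp_)
+    #   for gp_res in mutate_deep_narrow_4(
+    #           gp_, gp_.matching_node_pairs, 2, gp_in=False):
+    #       print_graph_pattern(gp_res)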
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/test_sampling.py b/tests/test_sampling.py
index c0afe08..044449d 100644
--- a/tests/test_sampling.py
+++ b/tests/test_sampling.py
@@ -8,6 +8,7 @@
 """
 
 import logging
+import random
 from collections import defaultdict
 from collections import OrderedDict
 from os import getenv
@@ -41,8 +42,8 @@
 logger = logging.getLogger(__name__)
 
 sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT)
-#sparql = SPARQLWrapper.SPARQLWrapper(
-#    getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql'))
+# sparql = SPARQLWrapper.SPARQLWrapper(
+#     getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql'))
 try:
     timeout = max(5, calibrate_query_timeout(sparql))  # 5s for warmup
 except IOError:
@@ -113,7 +114,7 @@ ground_truth_pairs_2 = get_semantic_associations()
 ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2)
-ground_truth_pairs_2 = ground_truth_pairs_2[1:100]
+ground_truth_pairs_2 = random.sample(ground_truth_pairs_2, 100)
 
 ground_truth_pairs_3 = [
     (dbp['Barrister'], dbp['Law']),
@@ -132,7 +133,104 @@
 gtp_scores_4 = GTPScores(ground_truth_pairs_4)
 
 
-def test_steps(gtps):
+def test_count(gtps, max_out):
+    # values = {(SOURCE_VAR, TARGET_VAR): gtps}  better: only the sources here
+    source_list = [(stp[0], ) for stp in gtps]
+    values = {(SOURCE_VAR, ): source_list}
+    gp1 = GraphPattern([(SOURCE_VAR, a, b)])
+    gp2 = GraphPattern([(b, c, TARGET_VAR)])
+    # SPARQL query that randomly samples over a var from gp1
+    q = gp1.to_sparql_filter_by_count_out_query(
+        values=values, count_node=b, max_out=max_out, limit=200)
+    logger.info(q)
+    t, q_res1 = run_query(q)
+    logger.info(q_res1)
+    # create b_list, in which the results for b are "stored"
+    # TODO: turn this into a method that returns the found bindings of the
+    # desired variables as a list of tuples.
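+    # One possible shape for that helper, as a rough sketch (the name
+    # bindings_to_tuple_list and its exact signature are assumptions, it
+    # does not exist in this module yet):
+    #
+    #   def bindings_to_tuple_list(q_res, variables):
+    #       rows = sparql_json_result_bindings_to_rdflib(
+    #           get_path(q_res, ['results', 'bindings'], default=[]))
+    #       return [tuple(get_path(row, [v]) for v in variables)
+    #               for row in rows]
+    #
+    # b_list below could then simply be bindings_to_tuple_list(q_res1, [b]).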
+ res_rows_path = ['results', 'bindings'] + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) + ) + b_list = [] + for row in bind1: + x = get_path(row, [b]) + y = (x, ) + b_list.append(y) + logger.info('orig query took %.4f s, result:\n%s\n', t, b_list) + b_list[:] = [b_l for b_l in b_list if not list_remove_bool(b_l[0])] + b_list = list(set(b_list)) + # Values für die nächste query: b_list + values = {(b, ): b_list} + # Query die über eine var aus gp2 random samplet mit values aus b_list + q = gp2.to_sparql_select_sample_query(values=values, limit=5000) + logger.info(q) + try: + t, q_res2 = run_query(q) + except: + return [] + # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) + ) + target_list = [] + for row in bind2: + target_list.append(get_path(row, [TARGET_VAR])) + logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) + # Kreire gtps_2 in der alle gtps, deren targets in target_list enthalten + # sind, "gespeichert" werden + gtps_2 = [] + for t in target_list: + for gtp in gtps: + if t == gtp[1]: + gtps_2.append(gtp) + logger.info(gtps_2) + + # GraphPattern mit gefixten Pfaden aus den gefundenen gtp kreieren: + # TODO: Das ganze als Methode aus einem graph-pattern, den results und + # den stp + gp_list = [] + for row2 in bind2: + for gtp in gtps: + if gtp[1] == get_path(row2, [TARGET_VAR]): + for row1 in bind1: + if get_path(row1, [b]) == get_path(row2, [b]): + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), TARGET_VAR) + ]) + if gp_ not in gp_list: + gp_list.append(gp_) + + # gp3 = GraphPattern([ + # (SOURCE_VAR, a, b), + # (b, c, TARGET_VAR) + # ]) + gtp_scores = GTPScores(gtps) + # gtp_scores2 = GTPScores(gtps_2) + + # # Fixe das pattern über die gefundenen gtps + # mfv2 = [] + # if len(gtps_2) > 1: + # mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3) + # + # # lasse die gefundenen Pattern einmal durch die fix_var laufen + # mfv = [] + # for gp_mfv2 in mfv2: + # mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2) + # for gp_res in mfv_res: + # mfv.append(gp_res) + # + # # evaluiere die so gefundenen Pattern + # res_eval = eval_gp_list(gtp_scores, mfv) + # return res_eval + + # evaluiere die gefixten pattern + res_eval = eval_gp_list(gtp_scores, gp_list) + return res_eval + + +def test_sample(gtps): values = {(SOURCE_VAR, TARGET_VAR): gtps} gp1 = GraphPattern([(SOURCE_VAR, a, b)]) gp2 = GraphPattern([(b, c, TARGET_VAR)]) @@ -140,16 +238,15 @@ def test_steps(gtps): # TODO: Query so verändern, dass nach count gefiltert wird (siehe log.txt) q = gp1.to_sparql_select_sample_query(values=values, limit=100) logger.info(q) - t, q_res = run_query(q) - logger.info(q_res) + t, q_res1 = run_query(q) + logger.info(q_res1) # Kreiere b_list in der die Ergebnisse für b "gespeichert" sind - # TODO speichere alles um später den Weg nachzuvollziehen res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(q_res, res_rows_path, default=[]) + bind1 = sparql_json_result_bindings_to_rdflib( + get_path(q_res1, res_rows_path, default=[]) ) b_list = [] - for row in bind: + for row in bind1: x = get_path(row, [b]) y = (x, ) b_list.append(y) @@ -160,16 +257,15 @@ def test_steps(gtps): # Query die über eine var aus gp2 random samplet mit values aus b_list q = gp2.to_sparql_select_sample_query(values=values, limit=5000) logger.info(q) - t, q_res = 
run_query(q) + t, q_res2 = run_query(q) # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind - res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(q_res, res_rows_path, default=[]) + bind2 = sparql_json_result_bindings_to_rdflib( + get_path(q_res2, res_rows_path, default=[]) ) target_list = [] - for row in bind: + for row in bind2: target_list.append(get_path(row, [TARGET_VAR])) - logger.info('orig query took %.4f s, result:\n%s\n', t, q_res) + logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) # Kreire gtps_2 in der alle gtps, deren targets in target_list enthalten # sind, "gespeichert" werden gtps_2 = [] @@ -179,27 +275,47 @@ def test_steps(gtps): gtps_2.append(gtp) logger.info(gtps_2) - gp3 = GraphPattern([ - (SOURCE_VAR, a, b), - (b, c, TARGET_VAR) - ]) + # GraphPattern mit gefixten Pfaden aus den gefundenen gtp kreieren: + # TODO: Das ganze als Methode aus einem graph-pattern, den results und + # den stp + gp_list = [] + for row2 in bind2: + for gtp in gtps: + if gtp[1] == get_path(row2, [TARGET_VAR]): + for row1 in bind1: + if get_path(row1, [b]) == get_path(row2, [b]): + gp_ = GraphPattern([ + (SOURCE_VAR, get_path(row1, [a]), b), + (b, get_path(row2, [c]), TARGET_VAR) + ]) + if gp_ not in gp_list: + gp_list.append(gp_) + + # gp3 = GraphPattern([ + # (SOURCE_VAR, a, b), + # (b, c, TARGET_VAR) + # ]) gtp_scores = GTPScores(gtps) - gtp_scores2 = GTPScores(gtps_2) - - # Fixe das pattern über die gefundenen gtps - mfv2 = [] - if len(gtps_2) > 1: - mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3) - - # lasse die gefundenen Pattern einmal durch die fix_var laufen - mfv = [] - for gp_mfv2 in mfv2: - mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2) - for gp_res in mfv_res: - mfv.append(gp_res) - - # evaluiere die so gefundenen Pattern - res_eval = eval_gp_list(gtp_scores, mfv) + # gtp_scores2 = GTPScores(gtps_2) + + # # Fixe das pattern über die gefundenen gtps + # mfv2 = [] + # if len(gtps_2) > 1: + # mfv2 = mutate_fix_var(sparql, timeout, gtp_scores2, gp3) + # + # # lasse die gefundenen Pattern einmal durch die fix_var laufen + # mfv = [] + # for gp_mfv2 in mfv2: + # mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp_mfv2) + # for gp_res in mfv_res: + # mfv.append(gp_res) + # + # # evaluiere die so gefundenen Pattern + # res_eval = eval_gp_list(gtp_scores, mfv) + # return res_eval + + # evaluiere die gefixten pattern + res_eval = eval_gp_list(gtp_scores, gp_list) return res_eval @@ -235,7 +351,8 @@ def list_remove_bool(var): # keine Probleme mit dem Category:Cigarettes-Beispiel zu bekommen # (siehe docs) # TODO: Möglicherweise dafür sorgen, dass die nicht rausgeschmissen, - # sondern nur nicht mit prefix gekürzt werden + # sondern nur nicht mit prefix gekürzt werden, also einfach mal schauen, + # dass die curify das tut was sie soll elif isinstance(var, URIRef): return ':' in var[7:] return False @@ -247,17 +364,35 @@ def eval_gp_list(gtp_scores, gp_list): res_ev = evaluate( sparql, timeout, gtp_scores, gp_l, run=0, gen=0) update_individuals([gp_l], [res_ev]) - #print_graph_pattern(gp_, print_matching_node_pairs=0) + # print_graph_pattern(gp_, print_matching_node_pairs=0) return gp_list if __name__ == '__main__': + # # test_sample: + # res = [] + # for i in range(10): + # res_ts = test_sample(ground_truth_pairs_2) + # for gp_ts in res_ts: + # res.append(gp_ts) + # + # res = sorted(res, key=lambda gp_: -gp_.fitness.values.score) + # for res_ in res: + # print_graph_pattern(res_) + + # 
test_count
+    res = []
+    for i in range(1):
+        ground_truth_pairs_5 = get_semantic_associations()
+        ground_truth_pairs_5 = random.sample(ground_truth_pairs_5, 200)
+        max_out_steps = [10, 15, 20, 25, 30, 40, 50, 75, 100]
+        for j in max_out_steps:
+            res_ts = test_count(ground_truth_pairs_5, j)
+            for gp_ts in res_ts:
+                res.append((gp_ts, j))
+
+    res = sorted(res, key=lambda gp_: -gp_[0].fitness.values.score)
+    res = res[0:100]
+    for res_ in res:
+        print('max_out:'+str(res_[1]))
+        print_graph_pattern(res_[0])
From f67c7309605bcdde33f1155beeda0d346c6bf9ab Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Fri, 31 Aug 2018 13:58:28 +0200
Subject: [PATCH 05/27] Deep-and-Narrow-Path-Mutation should be runnable

---
 config/defaults.py | 3 +
 gp_learner.py | 128 +++++++++++++++++++++++++++-
 gp_query.py | 141 +++++++++++++++++++++++++++++++
 graph_pattern.py | 5 +-
 tests/test_mutate_deep_narrow.py | 47 +++++------
 5 files changed, 294 insertions(+), 30 deletions(-)

diff --git a/config/defaults.py b/config/defaults.py
index cf153d9..cee401d 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -89,6 +89,9 @@
 MUTPB_FV_SAMPLE_MAXN = 32  # max n of instantiations to sample from top k
 MUTPB_FV_QUERY_LIMIT = 256  # SPARQL query limit for the top k instantiations
 MUTPB_SP = 0.05  # prob to simplify pattern (warning: can restrict exploration)
+MUTPB_DN = 0.5  # prob to try adding a deep and narrow path to a pattern
+MUTPB_DN_PS_MAX_N = 10  # Max steps in the deep narrow path
+MUTPB_DN_AVG_LIMIT = 10  # Max avg. reachable Nodes
 
 # fusion of target candidates:
 FUSION_SAMPLES_PER_CLASS = 500  # only use up to n training samples per class
diff --git a/gp_learner.py b/gp_learner.py
index 008310f..c9462e9 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -54,6 +54,8 @@
 from gp_query import query_stats
 from gp_query import query_time_hard_exceeded
 from gp_query import query_time_soft_exceeded
+from gp_query import useful_path_query
+from gp_query import useful_path_inst_query
 from gp_query import variable_substitution_query
 from graph_pattern import canonicalize
 from graph_pattern import gen_random_var
@@ -684,6 +686,121 @@ def mutate_fix_var(
     ]
     return res
 
+def mutate_deep_narrow(
+        sparql,
+        timeout,
+        child,
+        gtp_scores,
+        dn_path_steps_max_n=config.MUTPB_DN_PS_MAX_N,
+        direct=None,
+        childin=False,
+        limit=config.MUTPB_FV_QUERY_LIMIT,  # TODO: actually use the limit?
+):
+    if not child.matching_node_pairs:
+        ev = evaluate(
+            sparql, timeout, gtp_scores, child)  # TODO: do run/gen need to be passed in here?
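+        # Note: evaluate() returns the full evaluation result for child and
+        # update_individuals() below writes it back onto the pattern, which
+        # is what (re)fills child.matching_node_pairs used just below.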
+ update_individuals([child], [ev]) + gtps = child.matching_node_pairs + if not gtps: + return [child] + #TODO: testen, wie die Verteilung gut ist + n = random.choice(range(dn_path_steps_max_n))+1 + n = 2 + node = [SOURCE_VAR] + for i in range(n): + node.append(Variable('n%i' % i)) + node.append(TARGET_VAR) + hop = [Variable('p%i' % i) for i in range(n + 1)] + # TODO: Entfernern, wenn direct einfach immer random gewählt werden soll + if direct is None or len(direct) != n + 1: + logger.debug( + 'No direction chosen, or direction tuple with false length' + ) + direct = [0 for _ in range(n + 1)] + gp_helper = [] + for i in range(n + 1): + if direct[i] == 0: + direct[i] = random.choice([-1, 1]) + if direct[i] == 1: + gp_helper.append( + GraphPattern([(node[i], hop[i], node[i + 1])]) + ) + else: + gp_helper.append( + GraphPattern([(node[i + 1], hop[i], node[i])]) + ) + # Queries für die Schritte + valueblocks_s = {} + valueblocks_t = {} + for i in range(int((n / 2) + 1)): + if i < int(n/2): + t, q_res = useful_path_query( + sparql, + timeout, + child, + hop[i], + node[i+1], + valueblocks_s, + gp_helper[:i + 1], + SOURCE_VAR, + gp_in=childin, + ) + if not q_res: + return [child] + valueblocks_s[hop[i]] = { + (hop[i],): random.sample( + [(q_r,) for q_r in q_res], + min(10, len(q_res)) + ) + } + if n-i > i: + t, q_res = useful_path_query( + sparql, + timeout, + child, + hop[n-i], + node[n-i], + valueblocks_t, + gp_helper[n - i:], + TARGET_VAR, + gp_in=childin, + ) + if not q_res: + return [child] + valueblocks_t[hop[n-i]] = { + (hop[n-i],): random.sample( + [(q_r,) for q_r in q_res], + min(10, len(q_res)) + ) + } + + # Query fürs Ergebnis + # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden + # werden + valueblocks = {} + valueblocks.update(valueblocks_s) + valueblocks.update(valueblocks_t) + t, q_res = useful_path_inst_query( + sparql, + timeout, + child, + hop, + valueblocks, + gp_helper, + gp_in=childin + ) + if not q_res: + return [child] + res = [] + for inst in q_res: + child_inst = GraphPattern([ + (node[i], inst[i], node[i + 1]) if direct[i] == 1 + else (node[i + 1], inst[i], node[i]) + for i in range(n + 1) + ]) + res.append(GraphPattern(child + child_inst)) + return res + def mutate_simplify_pattern(gp): if len(gp) < 2: @@ -797,6 +914,7 @@ def mutate( pb_mv=config.MUTPB_MV, pb_sp=config.MUTPB_SP, pb_sv=config.MUTPB_SV, + pb_dn=config.MUTPB_DN, ): # mutate patterns: # grow: select random identifier and convert them into a var (local) @@ -837,8 +955,14 @@ def mutate( else: children = [child] - - # TODO: deep & narrow paths mutation + helper = [] + for child in children: + if random.random() < pb_dn: + res = mutate_deep_narrow(sparql, timeout, gtp_scores, child) + helper += res + else: + helper.append(child) + children = helper children = { c if fit_to_live(c) else orig_child diff --git a/gp_query.py b/gp_query.py index 0a4618d..eae1c4f 100644 --- a/gp_query.py +++ b/gp_query.py @@ -62,6 +62,8 @@ def __init__(self): self.ask_multi_query_count = 0 self.combined_ask_count_multi_query_count = 0 self.variable_substitution_query_count = 0 + self.useful_path_query_count = 0 + self.useful_path_inst_query_count = 0 self.predict_query_count = 0 self.count_query_count = 0 @@ -695,6 +697,145 @@ def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds): def _var_subst_res_update(res, update, **_): res += update + + +def useful_path_query( + sparql, + timeout, + graph_pattern, + var_to_fix, + var_to_count, + valueblocks, + steps, + startvar, + 
avglimit=config.MUTPB_DN_AVG_LIMIT, + gp_in=False, + batch_size=None +): + _query_stats.useful_path_query_count += 1 + # TODO: evtl. je 10 pro 'gefixter' Variable von batch-size abziehen + # (weil der Block ja mit rein geht) + _values = graph_pattern.matching_node_pairs + # TODO: evtl. Schnitt mit noch nicht abgedeckten + _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs} + _vars_steps_and_stuff = ( + var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in + ) + return _multi_query( + sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs, + batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping, + _usef_path_res_init, _usef_path_chunk_q, _usef_path_chunk_result_ext, + _usef_path_res_update + ) + + +# noinspection PyUnusedLocal +def _usef_path_res_init(_, **kwds): + return [] + + +def _usef_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk): + var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \ + = _vars_steps_and_stuff + valueblocks = { + startvar: { + (startvar,): + [(tup[0],) for tup in values_chunk] if startvar == SOURCE_VAR + else [(tup[1],) for tup in values_chunk] + } + } + valueblocks.update(_valueblocks) + return gp.to_sparql_useful_path_query( + var_to_fix, + var_to_count, + valueblocks, + steps, + startvar, + avglimit=avglimit, + gp_in=gp_in + ) + + +# noinspection PyUnusedLocal +def _usef_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): + var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \ + = _vars_steps_and_stuff + chunk_res = [] + res_rows_path = ['results', 'bindings'] + bindings = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + + for row in bindings: + # TODO: Drüber nachdenken, ob iwie die avg-outgoing auch mit + # zurückgegeben werden sollen + chunk_res.append(get_path(row, [var_to_fix])) + return chunk_res + + +def _usef_path_res_update(res, update, **_): + res += update + + +def useful_path_inst_query( + sparql, + timeout, + graph_pattern, + hop, + valueblocks, + steps, + gp_in=False, + batch_size=None +): + _query_stats.useful_path_inst_query_count += 1 + # TODO: evtl. je 10 pro 'gefixter' Variable von batch-size abziehen + # (weil der Block ja mit rein geht) + _values = graph_pattern.matching_node_pairs + # evtl. 
Schnitt mit noch nicht abgedeckten + _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs} + _vars_steps_and_stuff = (hop, valueblocks, steps, gp_in) + return _multi_query( + sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs, + batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping, + _usef_path_inst_res_init, _usef_path_inst_chunk_q, + _usef_path_inst_chunk_result_ext, _usef_path_inst_res_update + ) + + +# noinspection PyUnusedLocal +def _usef_path_inst_res_init(_, **kwds): + return [] + + +def _usef_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): + hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff + valueblocks = { + 'st': { + (SOURCE_VAR, TARGET_VAR): values_chunk + } + } + valueblocks.update(_valueblocks) + return gp.to_sparql_useful_path_inst_query( + hop, valueblocks, steps, gp_in=gp_in + ) + + +# noinspection PyUnusedLocal +def _usef_path_inst_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): + hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff + chunk_res = [] + res_rows_path = ['results', 'bindings'] + bindings = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + + for row in bindings: + chunk_res.append([get_path(row, [h]) for h in hop]) + return chunk_res + + +def _usef_path_inst_res_update(res, update, **_): + res += update def generate_stps_from_gp(sparql, gp): diff --git a/graph_pattern.py b/graph_pattern.py index e1468ad..62c6a2c 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -830,14 +830,13 @@ def to_sparql_useful_path_query( var_to_count, valueblocks, steps, - startvar=None, + startvar, avglimit=10, gp_in=False ): + # TODO: evtl. Limit zufügen count_var_to_count = Variable('c' + ''.join(var_to_count)) avg_var_to_count = Variable('avgc' + ''.join(var_to_count)) - if startvar is None: - startvar = SOURCE_VAR res = "SELECT %(vtf)s (AVG(%(cvtc)s) as %(avtc)s) {\n" \ "SELECT %(stv)s %(vtf)s (COUNT (%(vtc)s) as %(cvtc)s) {\n" \ "%(val)s\n" \ diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py index bbcbdca..1369f18 100644 --- a/tests/test_mutate_deep_narrow.py +++ b/tests/test_mutate_deep_narrow.py @@ -27,6 +27,7 @@ from config import SPARQL_ENDPOINT from gp_learner import evaluate +from gp_learner import mutate_deep_narrow from gp_learner import mutate_fix_var from gp_learner import update_individuals from gp_query import calibrate_query_timeout @@ -2262,7 +2263,7 @@ def mutate_deep_narrow_n_hops( # erste Version, komplett straight forward -def mutate_deep_narrow( +def mutate_deep_narrow_1( gp_, gtps, n, direct=None, gp_in=False ): node = [SOURCE_VAR] @@ -2651,7 +2652,7 @@ def mutate_deep_narrow_4( for i in range(int((n / 2) + 1)): if i < int(n/2): q = gp_.to_sparql_useful_path_query( - hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in + hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], SOURCE_VAR, gp_in=gp_in ) logger.debug(q) try: @@ -2678,7 +2679,7 @@ def mutate_deep_narrow_4( node[n-i], valueblocks_t, gp_helper[n-i:], - startvar=TARGET_VAR, + TARGET_VAR, gp_in=gp_in ) logger.debug(q) @@ -3102,29 +3103,25 @@ def main(): # ground_truth_pairs = ground_truth_pairs[:100] gtp_scores = GTPScores(ground_truth_pairs) res = [] - # key = random.choice(gp_found.keys()) - # for i in range(100): - # # ground_truth_pairs = random.sample(ground_truth_pairs, 200) - # gp_ = GraphPattern([]) - # # gp_ = gp_found[key] - # res_= mutate_deep_narrow_5(gp_, ground_truth_pairs, 2, gp_in=False) - # res.append(res_) - # logger.info(i) 
-    #     if res_:
-    #         logger.info(res_)
-    #
-    # logger.info(res)
-    for key in gp_found.keys():
+    for i in range(100):
+        key = random.choice(gp_found.keys())
         gp_ = gp_found[key]
-        eval_gp(gtp_scores, gp_)
-        for i in range(100):
-            res_ = mutate_deep_narrow_4(
-                gp_, gp_.matching_node_pairs, 6, gp_in=False
-            )
-            res.append(res_)
-            logger.info((i, key))
-            if res_:
-                logger.info(res_)
+        # eval_gp(gtp_scores, gp_)
+        r = mutate_deep_narrow(sparql, timeout, gp_, gtp_scores)
+        logger.info(i)
+        logger.info(r)
+        res.append(r)
+    # for key in gp_found.keys():
+    #     gp_ = gp_found[key]
+    #     eval_gp(gtp_scores, gp_)
+    #     for i in range(100):
+    #         res_ = mutate_deep_narrow_4(
+    #             gp_, gp_.matching_node_pairs, 6, gp_in=False
+    #         )
+    #         res.append(res_)
+    #         logger.info((i, key))
+    #         if res_:
+    #             logger.info(res_)
 
     # res_eval=[]
     # res = []
From c908cafd24e736b75192dfb2ba38f767180a949f Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Mon, 3 Sep 2018 11:32:37 +0200
Subject: [PATCH 06/27] Small change and bug fix in gp_learner.py

---
 gp_learner.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gp_learner.py b/gp_learner.py
index c9462e9..108a59c 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -686,26 +686,26 @@ def mutate_fix_var(
     ]
     return res
 
+
 def mutate_deep_narrow(
         sparql,
         timeout,
-        child,
         gtp_scores,
+        child,
         dn_path_steps_max_n=config.MUTPB_DN_PS_MAX_N,
-        direct=None,
+        direct=None,
         childin=False,
         limit=config.MUTPB_FV_QUERY_LIMIT,  # TODO: actually use the limit?
 ):
-    if not child.matching_node_pairs:
+    if not child.fitness.valid:
         ev = evaluate(
-            sparql, timeout, gtp_scores, child)  # TODO: do run/gen need to be passed in here?
+            sparql, timeout, gtp_scores, child, run=-1, gen=-1)  # TODO: do run/gen need to be passed in here?
         update_individuals([child], [ev])
     gtps = child.matching_node_pairs
     if not gtps:
         return [child]
-    #TODO: testen, wie die Verteilung gut ist
-    n = random.choice(range(dn_path_steps_max_n))+1
-    n = 2
+    # TODO: testen, wie die Verteilung gut ist
+    n = random.choice(range(dn_path_steps_max_n)) + 1
From adbc215cc52c965a9fd2d7040d109d6d21b4a02a Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Tue, 4 Sep 2018 13:24:14 +0200
Subject: [PATCH 07/27] Undid unrelated modifications

---
 graph_pattern.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/graph_pattern.py b/graph_pattern.py
index 62c6a2c..bdb1d57 100644
--- a/graph_pattern.py
+++ b/graph_pattern.py
@@ -34,6 +34,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 RANDOM_VAR_LEN = 5  # so in total we have 62**5=916132832 different random vars
 RANDOM_VAR_PREFIX = 'vr'
 SOURCE_VAR = Variable('source')
@@ -240,11 +241,11 @@ def canonicalize(gp, shorten_varnames=True):
     cgp = GraphPattern(cbgp, mapping=mapping)
 
     if not (
-        len(gp) == len(cbgp) == len(cgp)
-        and len(gp.nodes) == len(cgp.nodes)
-        and len(gp.edges) == len(cgp.edges)
-        and sorted(gp.identifier_counts().values()) ==
-        sorted(cgp.identifier_counts().values())
+            len(gp) == len(cbgp) == len(cgp)
+            and len(gp.nodes) == len(cgp.nodes)
+            and len(gp.edges) == len(cgp.edges)
+            and sorted(gp.identifier_counts().values()) ==
+            sorted(cgp.identifier_counts().values())
     ):
         # canonicalization should never change any of the features above, but it
         # did before (e.g., https://github.com/RDFLib/rdflib/issues/494 ).
@@ -431,8 +432,8 @@ def exclude(self, identifiers):
             [(s, p, o)
              for s, p, o in self
              if p not in identifiers and
-             s not in identifiers and
-             o not in identifiers
+                s not in identifiers and
+                o not in identifiers
              ]
         )
 
@@ -447,7 +448,7 @@ def identifier_counts(self, exclude_vars=False, vars_only=False):
         :param vars_only: Only return counts for vars.
         :return: Counter of all identifiers in this graph pattern.
         """
-        assert not (exclude_vars and vars_only)
+        assert not(exclude_vars and vars_only)
         ids = Counter([i for t in self for i in t])
         if exclude_vars:
             for i in self.vars_in_graph:
From 20e5b343307dbdafd7c01669077106a3ba6b2889 Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Tue, 4 Sep 2018 13:40:32 +0200
Subject: [PATCH 08/27] Renamed two values and added alpha/beta values for the
 path length of deep_and_narrow_path_mutation

---
 config/defaults.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/config/defaults.py b/config/defaults.py
index cee401d..9a95607 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -89,9 +89,12 @@
 MUTPB_FV_SAMPLE_MAXN = 32  # max n of instantiations to sample from top k
 MUTPB_FV_QUERY_LIMIT = 256  # SPARQL query limit for the top k instantiations
 MUTPB_SP = 0.05  # prob to simplify pattern (warning: can restrict exploration)
+# TODO: Lower the MUTPB_DN
 MUTPB_DN = 0.5  # prob to try adding a deep and narrow path to a pattern
-MUTPB_DN_PS_MAX_N = 10  # Max steps in the deep narrow path
-MUTPB_DN_AVG_LIMIT = 10  # Max avg. reachable Nodes
+MUTPB_DN_MAX_HOPS = 10  # Max number of hops in the deep narrow path
+MUTPB_DN_MAX_HOPS_ALPHA = 5.  # alpha value in a length beta distribution
+MUTPB_DN_MAX_HOPS_BETA = 30.  # beta value in a length beta distribution
+MUTPB_DN_AVG_DEG_LIMIT = 10  # Max avg. reachable Nodes
 
 # fusion of target candidates:
 FUSION_SAMPLES_PER_CLASS = 500  # only use up to n training samples per class
From 22a786e6c5ddf231b1aa487600f794ab87b07e8d Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Tue, 4 Sep 2018 13:58:41 +0200
Subject: [PATCH 09/27] Changed values MUTPB_DN_MAX_HOPS_ALPHA / BETA

---
 config/defaults.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/defaults.py b/config/defaults.py
index 9a95607..bd10c50 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -92,8 +92,8 @@
 # TODO: Lower the MUTPB_DN
 MUTPB_DN = 0.5  # prob to try adding a deep and narrow path to a pattern
 MUTPB_DN_MAX_HOPS = 10  # Max number of hops in the deep narrow path
-MUTPB_DN_MAX_HOPS_ALPHA = 5.  # alpha value in a length beta distribution
-MUTPB_DN_MAX_HOPS_BETA = 30.  # beta value in a length beta distribution
+MUTPB_DN_MAX_HOPS_ALPHA = 2.  # alpha value in a length beta distribution
+MUTPB_DN_MAX_HOPS_BETA = 5.  # beta value in a length beta distribution
 MUTPB_DN_AVG_DEG_LIMIT = 10  # Max avg. 
reachable Nodes
 
 # fusion of target candidates:
From 91cbde0ddaa7987a276919b3ead051cb956f5037 Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 11:59:05 +0200
Subject: [PATCH 10/27] Changed argument order in mutate_deep_narrow() call

---
 tests/test_mutate_deep_narrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py
index 1369f18..3f62984 100644
--- a/tests/test_mutate_deep_narrow.py
+++ b/tests/test_mutate_deep_narrow.py
@@ -3107,7 +3107,7 @@ def main():
         key = random.choice(gp_found.keys())
         gp_ = gp_found[key]
         # eval_gp(gtp_scores, gp_)
-        r = mutate_deep_narrow(sparql, timeout, gp_, gtp_scores)
+        r = mutate_deep_narrow(sparql, timeout, gtp_scores, gp_)
         logger.info(i)
         logger.info(r)
         res.append(r)
From 9c3238a9d0e2e6df5dd18918e2fd333a1d580d50 Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 11:59:58 +0200
Subject: [PATCH 11/27] Renamed MUTPB_DN_AVG_LIMIT

---
 gp_query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gp_query.py b/gp_query.py
index eae1c4f..a7444b3 100644
--- a/gp_query.py
+++ b/gp_query.py
@@ -708,7 +708,7 @@ def useful_path_query(
         valueblocks,
         steps,
         startvar,
-        avglimit=config.MUTPB_DN_AVG_LIMIT,
+        avglimit=config.MUTPB_DN_AVG_DEG_LIMIT,
         gp_in=False,
         batch_size=None
 ):
From 6362dc8c14278f093b55eb795efe7ed703c22834 Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 12:17:45 +0200
Subject: [PATCH 12/27] Added beta distribution for mutation length; run
 deep-narrow-path mutation only if fix-var did not run

---
 gp_learner.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/gp_learner.py b/gp_learner.py
index 108a59c..6cc3335 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -692,20 +692,25 @@ def mutate_deep_narrow(
         timeout,
         gtp_scores,
         child,
-        dn_path_steps_max_n=config.MUTPB_DN_PS_MAX_N,
-        direct=None,
+        direct=None,
         childin=False,
         limit=config.MUTPB_FV_QUERY_LIMIT,  # TODO: actually use the limit?
 ):
     if not child.fitness.valid:
         ev = evaluate(
-            sparql, timeout, gtp_scores, child, run=-1, gen=-1)  # TODO: do run/gen need to be passed in here?
+            sparql, timeout, gtp_scores, child, run=-1, gen=-1)
         update_individuals([child], [ev])
     gtps = child.matching_node_pairs
     if not gtps:
         return [child]
-    # TODO: testen, wie die Verteilung gut ist
-    n = random.choice(range(dn_path_steps_max_n)) + 1
+    alpha = config.MUTPB_DN_MAX_HOPS_ALPHA
+    beta = config.MUTPB_DN_MAX_HOPS_BETA
+    max_hops = config.MUTPB_DN_MAX_HOPS
+    # more likely to create shorter paths
+    # with default values the distribution is as follows:
+    # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
+    # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99,6 %, ... 
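+    # A worked sketch of the mapping below (values are illustrative only,
+    # assuming the defaults alpha=2., beta=5., max_hops=10):
+    # random.betavariate(alpha, beta) draws x in (0, 1) with mean
+    # alpha / (alpha + beta) = 2/7 ~ 0.29, and int(x * (max_hops-1) + 1)
+    # turns that draw into a hop count, e.g.:
+    #   x = 0.05 -> n = 1,   x = 0.30 -> n = 3,   x = 0.90 -> n = 9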
+ n = int(random.betavariate(alpha, beta) * (max_hops-1) + 1) node = [SOURCE_VAR] for i in range(n): node.append(Variable('n%i' % i)) @@ -953,16 +958,10 @@ def mutate( child = canonicalize(child) children = mutate_fix_var(sparql, timeout, gtp_scores, child) else: - children = [child] - - helper = [] - for child in children: if random.random() < pb_dn: - res = mutate_deep_narrow(sparql, timeout, gtp_scores, child) - helper += res + children = mutate_deep_narrow(sparql, timeout, gtp_scores, child) else: - helper.append(child) - children = helper + children = [child] children = { c if fit_to_live(c) else orig_child From 1130da4349a5b1b985566ec64875810fb185992f Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 12:43:30 +0200 Subject: [PATCH 13/27] Code-Style changes and renamed mutate_deep_narrow to mutate_deep_narrow_path --- gp_learner.py | 41 +++++++++++++++----------------- tests/test_mutate_deep_narrow.py | 4 ++-- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/gp_learner.py b/gp_learner.py index 6cc3335..ce7b1b4 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -687,7 +687,7 @@ def mutate_fix_var( return res -def mutate_deep_narrow( +def mutate_deep_narrow_path( sparql, timeout, gtp_scores, @@ -711,11 +711,8 @@ def mutate_deep_narrow( # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ... # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99,6 %, ... n = int(random.betavariate(alpha, beta) * (max_hops-1) + 1) - node = [SOURCE_VAR] - for i in range(n): - node.append(Variable('n%i' % i)) - node.append(TARGET_VAR) - hop = [Variable('p%i' % i) for i in range(n + 1)] + nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR] + hops = [Variable('p%d' % i) for i in range(n + 1)] # TODO: Entfernern, wenn direct einfach immer random gewählt werden soll if direct is None or len(direct) != n + 1: logger.debug( @@ -728,23 +725,23 @@ def mutate_deep_narrow( direct[i] = random.choice([-1, 1]) if direct[i] == 1: gp_helper.append( - GraphPattern([(node[i], hop[i], node[i + 1])]) + GraphPattern([(nodes[i], hops[i], nodes[i + 1])]) ) else: gp_helper.append( - GraphPattern([(node[i + 1], hop[i], node[i])]) + GraphPattern([(nodes[i + 1], hops[i], nodes[i])]) ) # Queries für die Schritte valueblocks_s = {} valueblocks_t = {} - for i in range(int((n / 2) + 1)): + for i in range(n // 2 + 1): if i < int(n/2): t, q_res = useful_path_query( sparql, timeout, child, - hop[i], - node[i+1], + hops[i], + nodes[i+1], valueblocks_s, gp_helper[:i + 1], SOURCE_VAR, @@ -752,8 +749,8 @@ def mutate_deep_narrow( ) if not q_res: return [child] - valueblocks_s[hop[i]] = { - (hop[i],): random.sample( + valueblocks_s[hops[i]] = { + (hops[i],): random.sample( [(q_r,) for q_r in q_res], min(10, len(q_res)) ) @@ -763,8 +760,8 @@ def mutate_deep_narrow( sparql, timeout, child, - hop[n-i], - node[n-i], + hops[n-i], + nodes[n-i], valueblocks_t, gp_helper[n - i:], TARGET_VAR, @@ -772,10 +769,10 @@ def mutate_deep_narrow( ) if not q_res: return [child] - valueblocks_t[hop[n-i]] = { - (hop[n-i],): random.sample( + valueblocks_t[hops[n-i]] = { + (hops[n-i],): random.sample( [(q_r,) for q_r in q_res], - min(10, len(q_res)) + min(config.MUTPB_DN_AVG_DEG_LIMIT, len(q_res)) ) } @@ -789,7 +786,7 @@ def mutate_deep_narrow( sparql, timeout, child, - hop, + hops, valueblocks, gp_helper, gp_in=childin @@ -799,8 +796,8 @@ def mutate_deep_narrow( res = [] for inst in q_res: child_inst = GraphPattern([ - (node[i], inst[i], node[i + 1]) if direct[i] == 1 - else 
(node[i + 1], inst[i], node[i]) + (nodes[i], inst[i], nodes[i + 1]) if direct[i] == 1 + else (nodes[i + 1], inst[i], nodes[i]) for i in range(n + 1) ]) res.append(GraphPattern(child + child_inst)) @@ -959,7 +956,7 @@ def mutate( children = mutate_fix_var(sparql, timeout, gtp_scores, child) else: if random.random() < pb_dn: - children = mutate_deep_narrow(sparql, timeout, gtp_scores, child) + children = mutate_deep_narrow_path(sparql, timeout, gtp_scores, child) else: children = [child] diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py index 3f62984..de0e443 100644 --- a/tests/test_mutate_deep_narrow.py +++ b/tests/test_mutate_deep_narrow.py @@ -27,7 +27,7 @@ from config import SPARQL_ENDPOINT from gp_learner import evaluate -from gp_learner import mutate_deep_narrow +from gp_learner import mutate_deep_narrow_path from gp_learner import mutate_fix_var from gp_learner import update_individuals from gp_query import calibrate_query_timeout @@ -3107,7 +3107,7 @@ def main(): key = random.choice(gp_found.keys()) gp_ = gp_found[key] # eval_gp(gtp_scores, gp_) - r = mutate_deep_narrow(sparql, timeout, gtp_scores, gp_) + r = mutate_deep_narrow_path(sparql, timeout, gtp_scores, gp_) logger.info(i) logger.info(r) res.append(r) From 12a95ae1e3507e015c381e616622d5cc5d6fa12b Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 12:44:21 +0200 Subject: [PATCH 14/27] Renamed useful_path_(inst_)query to deep_narrow_path_(inst_)query --- gp_learner.py | 10 +++++----- gp_query.py | 31 +++++++++++++++++-------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/gp_learner.py b/gp_learner.py index ce7b1b4..099409c 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -54,8 +54,8 @@ from gp_query import query_stats from gp_query import query_time_hard_exceeded from gp_query import query_time_soft_exceeded -from gp_query import useful_path_query -from gp_query import useful_path_inst_query +from gp_query import deep_narrow_path_query +from gp_query import deep_narrow_path_inst_query from gp_query import variable_substitution_query from graph_pattern import canonicalize from graph_pattern import gen_random_var @@ -736,7 +736,7 @@ def mutate_deep_narrow_path( valueblocks_t = {} for i in range(n // 2 + 1): if i < int(n/2): - t, q_res = useful_path_query( + t, q_res = deep_narrow_path_query( sparql, timeout, child, @@ -756,7 +756,7 @@ def mutate_deep_narrow_path( ) } if n-i > i: - t, q_res = useful_path_query( + t, q_res = deep_narrow_path_query( sparql, timeout, child, @@ -782,7 +782,7 @@ def mutate_deep_narrow_path( valueblocks = {} valueblocks.update(valueblocks_s) valueblocks.update(valueblocks_t) - t, q_res = useful_path_inst_query( + t, q_res = deep_narrow_path_inst_query( sparql, timeout, child, diff --git a/gp_query.py b/gp_query.py index a7444b3..c0a8bea 100644 --- a/gp_query.py +++ b/gp_query.py @@ -699,7 +699,7 @@ def _var_subst_res_update(res, update, **_): res += update -def useful_path_query( +def deep_narrow_path_query( sparql, timeout, graph_pattern, @@ -724,17 +724,17 @@ def useful_path_query( return _multi_query( sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs, batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping, - _usef_path_res_init, _usef_path_chunk_q, _usef_path_chunk_result_ext, - _usef_path_res_update + _deep_narrow_path_res_init, _deep_narrow_path_chunk_q, + _deep_narrow_path_chunk_result_ext, _deep_narrow_path_res_update ) # noinspection PyUnusedLocal -def _usef_path_res_init(_, **kwds): +def 
_deep_narrow_path_res_init(_, **kwds): return [] -def _usef_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk): +def _deep_narrow_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk): var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \ = _vars_steps_and_stuff valueblocks = { @@ -757,7 +757,7 @@ def _usef_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk): # noinspection PyUnusedLocal -def _usef_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): +def _deep_narrow_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \ = _vars_steps_and_stuff chunk_res = [] @@ -773,11 +773,11 @@ def _usef_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): return chunk_res -def _usef_path_res_update(res, update, **_): +def _deep_narrow_path_res_update(res, update, **_): res += update -def useful_path_inst_query( +def deep_narrow_path_inst_query( sparql, timeout, graph_pattern, @@ -797,17 +797,18 @@ def useful_path_inst_query( return _multi_query( sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs, batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping, - _usef_path_inst_res_init, _usef_path_inst_chunk_q, - _usef_path_inst_chunk_result_ext, _usef_path_inst_res_update + _deep_narrow_path_inst_res_init, _deep_narrow_path_inst_chunk_q, + _deep_narrow_path_inst_chunk_result_ext, + _deep_narrow_path_inst_res_update ) # noinspection PyUnusedLocal -def _usef_path_inst_res_init(_, **kwds): +def _deep_narrow_path_inst_res_init(_, **kwds): return [] -def _usef_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): +def _deep_narrow_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff valueblocks = { 'st': { @@ -821,7 +822,9 @@ def _usef_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): # noinspection PyUnusedLocal -def _usef_path_inst_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): +def _deep_narrow_path_inst_chunk_result_ext( + q_res, _vars_steps_and_stuff, _, **kwds +): hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff chunk_res = [] res_rows_path = ['results', 'bindings'] @@ -834,7 +837,7 @@ def _usef_path_inst_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds): return chunk_res -def _usef_path_inst_res_update(res, update, **_): +def _deep_narrow_path_inst_res_update(res, update, **_): res += update From 22ca6aac182c47f970ade142698c44925c32c857 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 12:49:00 +0200 Subject: [PATCH 15/27] Renamed to_sparql_useful_path/_inst_query() to to_sparql_deep_narrow_path_(inst_)query --- gp_query.py | 2 +- graph_pattern.py | 4 ++-- tests/test_mutate_deep_narrow.py | 26 +++++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/gp_query.py b/gp_query.py index c0a8bea..5f1327d 100644 --- a/gp_query.py +++ b/gp_query.py @@ -745,7 +745,7 @@ def _deep_narrow_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk): } } valueblocks.update(_valueblocks) - return gp.to_sparql_useful_path_query( + return gp.to_sparql_deep_narrow_path_query( var_to_fix, var_to_count, valueblocks, diff --git a/graph_pattern.py b/graph_pattern.py index bdb1d57..859584b 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -825,7 +825,7 @@ def to_sparql_filter_by_count_in_out_query( res = textwrap.dedent(res) return gp_._sparql_prefix(res) - def to_sparql_useful_path_query( + def 
to_sparql_deep_narrow_path_query( self, var_to_fix, var_to_count, @@ -866,7 +866,7 @@ def to_sparql_useful_path_query( res = textwrap.dedent(res) return self._sparql_prefix(res) - def to_sparql_inst_query( + def to_sparql_deep_narrow_path_inst_query( self, hop, valueblocks, diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py index de0e443..a99a94a 100644 --- a/tests/test_mutate_deep_narrow.py +++ b/tests/test_mutate_deep_narrow.py @@ -2304,7 +2304,7 @@ def mutate_deep_narrow_1( valueblocks = {} valueblocks[SOURCE_VAR] = values[SOURCE_VAR] for i in range(n+1): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -2337,7 +2337,7 @@ def mutate_deep_narrow_1( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2408,7 +2408,7 @@ def mutate_deep_narrow_2( valueblocks = {} valueblocks[SOURCE_VAR] = values[SOURCE_VAR] for i in range(n): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -2510,7 +2510,7 @@ def mutate_deep_narrow_3( valueblocks_t = {} valueblocks_t[TARGET_VAR] = values[TARGET_VAR] for i in range(int((n / 2) + 1)): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -2533,7 +2533,7 @@ def mutate_deep_narrow_3( ) } if n-i != i: - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[n-i], node[n-i], valueblocks_t, @@ -2577,7 +2577,7 @@ def mutate_deep_narrow_3( if key is not TARGET_VAR: valueblocks[key] = valueblocks_t[key] valueblocks['st'] = values['st'] - q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2651,7 +2651,7 @@ def mutate_deep_narrow_4( valueblocks_t[TARGET_VAR] = values[TARGET_VAR] for i in range(int((n / 2) + 1)): if i < int(n/2): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], SOURCE_VAR, gp_in=gp_in ) logger.debug(q) @@ -2674,7 +2674,7 @@ def mutate_deep_narrow_4( ) } if n-i > i: - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[n-i], node[n-i], valueblocks_t, @@ -2786,7 +2786,7 @@ def mutate_deep_narrow_5( valueblocks = {} valueblocks[SOURCE_VAR] = values[SOURCE_VAR] for i in range(n+1): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -2816,7 +2816,7 @@ def mutate_deep_narrow_5( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2907,7 +2907,7 @@ def mutate_deep_narrow_6( valueblocks = {} valueblocks[SOURCE_VAR] = values[SOURCE_VAR] for i in range(n): - q = gp_.to_sparql_useful_path_query( + q = 
gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -3010,7 +3010,7 @@ def mutate_deep_narrow_7( valueblocks_t[TARGET_VAR] = values[TARGET_VAR] for i in range(int((n / 2) + 1)): if i < int(n/2): - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in ) logger.debug(q) @@ -3030,7 +3030,7 @@ def mutate_deep_narrow_7( hop[i], Variable('avgc' + ''.join(node[i + 1])), res_q[i] ) if n-i > i: - q = gp_.to_sparql_useful_path_query( + q = gp_.to_sparql_deep_narrow_path_query( hop[n-i], node[n-i], valueblocks_t, From 126e84db8abc0d35bdba914175f4fc420ef68429 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 12:55:36 +0200 Subject: [PATCH 16/27] Undone the changes in requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4a02904..d61d2fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,8 @@ nose>=1.3.7 numpy>=1.12.1 objgraph>=3.1.0 requests>=2.16.5 -rdflib>=4.2.1 -#git+git://github.com/RDFLib/rdflib@master#egg=rdflib +#rdflib>=4.2.1 +git+git://github.com/RDFLib/rdflib@master#egg=rdflib scikit-learn>=0.18.1 scipy>=0.19.0 scoop>=0.7.1.1 From 331e06f0d5003ec914f9725bd20838fad4df2224 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 14:25:24 +0200 Subject: [PATCH 17/27] Added default-value for max instances of hops --- config/defaults.py | 1 + 1 file changed, 1 insertion(+) diff --git a/config/defaults.py b/config/defaults.py index bd10c50..75d61e9 100644 --- a/config/defaults.py +++ b/config/defaults.py @@ -95,6 +95,7 @@ MUTPB_DN_MAX_HOPS_ALPHA = 2. # alpha value in a length beta distribution MUTPB_DN_MAX_HOPS_BETA = 5. # beta value in a length beta distribution MUTPB_DN_AVG_DEG_LIMIT = 10 # Max avg. 
reachable Nodes +MUTPB_DN_MAX_HOP_INST = 10 # Max number of hop instances for the next query/ies # fusion of target candidates: FUSION_SAMPLES_PER_CLASS = 500 # only use up to n training samples per class From 49b5c4dfa6d5ced99284ee5c499f0e676dfc16a6 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 14:29:10 +0200 Subject: [PATCH 18/27] Renamed the correct to_sparql_deep_narrow_path_inst_query() --- graph_pattern.py | 4 ++-- tests/test_mutate_deep_narrow.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/graph_pattern.py b/graph_pattern.py index 859584b..ce7b45a 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -866,7 +866,7 @@ def to_sparql_deep_narrow_path_query( res = textwrap.dedent(res) return self._sparql_prefix(res) - def to_sparql_deep_narrow_path_inst_query( + def to_sparql_deep_narrow_path_inst_query_old( self, hop, valueblocks, @@ -896,7 +896,7 @@ def to_sparql_deep_narrow_path_inst_query( return self._sparql_prefix(res) # TODO: die normale inst durch diese hier ersetzen (sollte überall gehen) - def to_sparql_useful_path_inst_query( + def to_sparql_deep_narrow_path_inst_query( self, hop, valueblocks, diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py index a99a94a..d564f6f 100644 --- a/tests/test_mutate_deep_narrow.py +++ b/tests/test_mutate_deep_narrow.py @@ -2337,7 +2337,7 @@ def mutate_deep_narrow_1( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query_old(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2435,7 +2435,7 @@ def mutate_deep_narrow_2( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_useful_path_inst_query( + q = gp_.to_sparql_deep_narrow_path_inst_query( hop, valueblocks, gp_helper, gp_in=gp_in ) logger.debug(q) @@ -2577,7 +2577,7 @@ def mutate_deep_narrow_3( if key is not TARGET_VAR: valueblocks[key] = valueblocks_t[key] valueblocks['st'] = values['st'] - q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query_old(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2713,7 +2713,7 @@ def mutate_deep_narrow_4( if key is not TARGET_VAR: valueblocks[key] = valueblocks_t[key] valueblocks['st'] = values['st'] - q = gp_.to_sparql_useful_path_inst_query( + q = gp_.to_sparql_deep_narrow_path_inst_query( hop, valueblocks, gp_helper, gp_in=gp_in ) logger.debug(q) @@ -2816,7 +2816,7 @@ def mutate_deep_narrow_5( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_deep_narrow_path_inst_query(hop, valueblocks, gp_help, gp_in=gp_in) + q = gp_.to_sparql_deep_narrow_path_inst_query_old(hop, valueblocks, gp_help, gp_in=gp_in) logger.debug(q) try: t, res_q_inst = run_query(q) @@ -2934,7 +2934,7 @@ def mutate_deep_narrow_6( # werden del valueblocks[SOURCE_VAR] valueblocks['st'] = values['st'] - q = gp_.to_sparql_useful_path_inst_query( + q = gp_.to_sparql_deep_narrow_path_inst_query( hop, valueblocks, gp_helper, gp_in=gp_in ) logger.debug(q) @@ -3066,7 +3066,7 @@ def mutate_deep_narrow_7( if key is not TARGET_VAR: valueblocks[key] = valueblocks_t[key] valueblocks['st'] = values['st'] - q = gp_.to_sparql_useful_path_inst_query( + q = gp_.to_sparql_deep_narrow_path_inst_query( hop, valueblocks, gp_helper, 
gp_in=gp_in
         )
         logger.debug(q)

From c0617ea505d456b1e48a5a74b86503fdeacb346d Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 15:00:11 +0200
Subject: [PATCH 19/27] Added docstring for mutate_deep_narrow_path() AND
 Renamed direct and childin AND added the possibility to choose n => length
 of directions AND some code-style changes AND comments => english

---
 gp_learner.py | 115 +++++++++++++++++++++++++++++---------------------
 1 file changed, 67 insertions(+), 48 deletions(-)

diff --git a/gp_learner.py b/gp_learner.py
index 099409c..3765dad 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -692,10 +692,34 @@ def mutate_deep_narrow_path(
     timeout,
     gtp_scores,
     child,
-    direct=None,
-    childin=False,
-    limit=config.MUTPB_FV_QUERY_LIMIT,  # TODO: Limit benutzen?
+    directions=None,
+    child_in_queries=False,
+    limit=None,  # TODO: Use a limit for the queries?
 ):
+    """Finds n-hop connections from source to target and adds them to a
+    given graph pattern.
+
+    The outline of the mutation is as follows:
+    - If not evaluated yet, evaluates the given GP to work on its matching
+      node pairs.
+    - If not passed in, randomly selects the path length and the directions
+      of the single hops.
+    - Issues SPARQL queries to find hops (from source and target) that don't
+      have a big fan-out (i.e., stay below the default limit). Uses a
+      default maximum number of found hops to search for the next hop.
+    When there is only one hop left to find, it tries to instantiate paths
+    that fit an STP. If such a path is found, its hops are added to the GP.
+    As there can be more than one such path, the mutation returns a list of
+    the resulting patterns.
+
+    :param directions: list of directions to use for the hops
+        (1: Source -> Target, -1: Target -> Source,
+        0 (or anything else): choose randomly)
+    :param child_in_queries: if True, add the triples of the given pattern
+        to the queries
+    :param limit: SPARQL limit
+    :return: list of children in which a deep narrow path was added
+    """
     if not child.fitness.valid:
         ev = evaluate(
             sparql, timeout, gtp_scores, child, run=-1, gen=-1)
@@ -703,35 +727,32 @@ def mutate_deep_narrow_path(
     gtps = child.matching_node_pairs
     if not gtps:
         return [child]
-    alpha = config.MUTPB_DN_MAX_HOPS_ALPHA
-    beta = config.MUTPB_DN_MAX_HOPS_BETA
-    max_hops = config.MUTPB_DN_MAX_HOPS
-    # more likely to create shorter paths
-    # with default values the distribution is as follows:
-    # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
-    # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99,6 %, ...
-    n = int(random.betavariate(alpha, beta) * (max_hops-1) + 1)
+    if directions:
+        n = len(directions) - 1
+    else:
+        alpha = config.MUTPB_DN_MAX_HOPS_ALPHA
+        beta = config.MUTPB_DN_MAX_HOPS_BETA
+        max_hops = config.MUTPB_DN_MAX_HOPS
+        # more likely to create shorter paths
+        # with default values the distribution is as follows:
+        # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
+        # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99.6 %, ...
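+        # e.g. (illustrative values, assuming max_hops were 7): a
+        # betavariate() draw of 0.3 would give n = int(0.3 * (7-1) + 1) = 2
+        # helper nodes (?n0, ?n1) and n + 1 = 3 hop variables (?p0..?p2)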
+ n = int(random.betavariate(alpha, beta) * (max_hops-1) + 1) nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR] hops = [Variable('p%d' % i) for i in range(n + 1)] - # TODO: Entfernern, wenn direct einfach immer random gewählt werden soll - if direct is None or len(direct) != n + 1: - logger.debug( - 'No direction chosen, or direction tuple with false length' - ) - direct = [0 for _ in range(n + 1)] - gp_helper = [] - for i in range(n + 1): - if direct[i] == 0: - direct[i] = random.choice([-1, 1]) - if direct[i] == 1: - gp_helper.append( - GraphPattern([(nodes[i], hops[i], nodes[i + 1])]) - ) - else: - gp_helper.append( - GraphPattern([(nodes[i + 1], hops[i], nodes[i])]) - ) - # Queries für die Schritte + if not directions: + directions = [0 for _ in range(n + 1)] + directions = [ + random.choice([-1, 1]) if d not in [-1, 1] else d for d in directions + ] + gp_hops = [ + # directions[i] == 1 => hop in the direction source -> target + GraphPattern([(nodes[i], hops[i], nodes[i + 1])]) if directions[i] == 1 + # directions[i] == -1 => hop in the direction target -> source + else GraphPattern([(nodes[i + 1], hops[i], nodes[i])]) + for i in range(n+1) + ] + # queries to get the first n hops: valueblocks_s = {} valueblocks_t = {} for i in range(n // 2 + 1): @@ -743,16 +764,16 @@ def mutate_deep_narrow_path( hops[i], nodes[i+1], valueblocks_s, - gp_helper[:i + 1], + gp_hops[:i + 1], SOURCE_VAR, - gp_in=childin, + gp_in=child_in_queries, ) if not q_res: return [child] valueblocks_s[hops[i]] = { (hops[i],): random.sample( [(q_r,) for q_r in q_res], - min(10, len(q_res)) + min(config.MUTPB_DN_MAX_HOP_INST, len(q_res)) ) } if n-i > i: @@ -763,22 +784,21 @@ def mutate_deep_narrow_path( hops[n-i], nodes[n-i], valueblocks_t, - gp_helper[n - i:], + gp_hops[n - i:], TARGET_VAR, - gp_in=childin, + gp_in=child_in_queries, ) if not q_res: return [child] valueblocks_t[hops[n-i]] = { (hops[n-i],): random.sample( [(q_r,) for q_r in q_res], - min(config.MUTPB_DN_AVG_DEG_LIMIT, len(q_res)) + min(config.MUTPB_DN_MAX_HOP_INST, len(q_res)) ) } - # Query fürs Ergebnis - # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden - # werden + # query to get the last hop and instantiations, that connect source and + # target valueblocks = {} valueblocks.update(valueblocks_s) valueblocks.update(valueblocks_t) @@ -788,19 +808,18 @@ def mutate_deep_narrow_path( child, hops, valueblocks, - gp_helper, - gp_in=childin + gp_hops, + gp_in=child_in_queries ) if not q_res: return [child] - res = [] - for inst in q_res: - child_inst = GraphPattern([ - (nodes[i], inst[i], nodes[i + 1]) if direct[i] == 1 - else (nodes[i + 1], inst[i], nodes[i]) + res = [ + child + GraphPattern([ + (nodes[i], qr[i], nodes[i + 1]) if directions[i] == 1 + else (nodes[i + 1], qr[i], nodes[i]) for i in range(n + 1) - ]) - res.append(GraphPattern(child + child_inst)) + ]) for qr in q_res + ] return res From 82cdacfb84addf1ef760c8ffcb9de324ec94cc59 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 15:02:35 +0200 Subject: [PATCH 20/27] Renamed the correct to_sparql_depp_narrow_path_inst_query() --- gp_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gp_query.py b/gp_query.py index 5f1327d..60caee0 100644 --- a/gp_query.py +++ b/gp_query.py @@ -816,7 +816,7 @@ def _deep_narrow_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk): } } valueblocks.update(_valueblocks) - return gp.to_sparql_useful_path_inst_query( + return gp.to_sparql_deep_narrow_path_inst_query( hop, 
valueblocks, steps, gp_in=gp_in
     )

From 9117d059a678023e8478e1446721d524a72ef2bd Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 15:12:40 +0200
Subject: [PATCH 21/27] Comments -> english

---
 gp_query.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/gp_query.py b/gp_query.py
index 60caee0..1763ece 100644
--- a/gp_query.py
+++ b/gp_query.py
@@ -713,10 +713,9 @@ def deep_narrow_path_query(
     batch_size=None
 ):
     _query_stats.useful_path_query_count += 1
-    # TODO: evtl. je 10 pro 'gefixter' Variable von batch-size abziehen
-    # (weil der Block ja mit rein geht)
+    # TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
     _values = graph_pattern.matching_node_pairs
-    # TODO: evtl. Schnitt mit noch nicht abgedeckten
+    # TODO: maybe restrict to STPs that are not yet well covered
     _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
     _vars_steps_and_stuff = (
         var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in
@@ -765,10 +764,8 @@ def _deep_narrow_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
     bindings = sparql_json_result_bindings_to_rdflib(
         get_path(q_res, res_rows_path, default=[])
     )
-
     for row in bindings:
-        # TODO: Drüber nachdenken, ob iwie die avg-outgoing auch mit
-        # zurückgegeben werden sollen
+        # TODO: Maybe return the avg-degree too
        chunk_res.append(get_path(row, [var_to_fix]))
     return chunk_res

@@ -788,10 +785,9 @@ def deep_narrow_path_inst_query(
     batch_size=None
 ):
     _query_stats.useful_path_inst_query_count += 1
-    # TODO: evtl. je 10 pro 'gefixter' Variable von batch-size abziehen
-    # (weil der Block ja mit rein geht)
+    # TODO: maybe batch_size = batch_size - 10 * number of valueblocks for hops
     _values = graph_pattern.matching_node_pairs
-    # evtl. Schnitt mit noch nicht abgedeckten
+    # TODO: maybe restrict to STPs that are not yet well covered
     _ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
     _vars_steps_and_stuff = (hop, valueblocks, steps, gp_in)
     return _multi_query(

From d792d10d9467e0b0538cc69d63f9384a83457317 Mon Sep 17 00:00:00 2001
From: "philipp.neuer"
Date: Wed, 5 Sep 2018 15:16:59 +0200
Subject: [PATCH 22/27] Erased all unused to_sparql_*_query()

---
 graph_pattern.py | 217 -----------------------------------------------
 1 file changed, 217 deletions(-)

diff --git a/graph_pattern.py b/graph_pattern.py
index ce7b45a..5b31917 100644
--- a/graph_pattern.py
+++ b/graph_pattern.py
@@ -636,194 +636,6 @@ def to_sparql_select_query(
         res = textwrap.dedent(res)
         return self._sparql_prefix(res)
 
-    def to_sparql_select_sample_query(
-        self,
-        values,
-        values_s_t=None,
-        projection=None,
-        limit=None,
-        sample_var=None
-    ):
-        """Generates a SPARQL select sample query from the graph pattern.
-
-        Examples:
-            TODO
-
-        Args:
-            values: a dict mapping a variable tuple to a list of binding tuples,
-                e.g. {(v1, v2): [(uri1, uri2), (uri3, uri4), ...]}
-            values_s_t: TODO
-            projection: which variables to select on, by default all vars.
- limit: integer to limit the result size - sample_var: the variable to sample over - """ - assert self.vars_in_graph, \ - "tried to get sparql for pattern without vars: %s" % (self,) - - if projection is None: - projection = sorted([v for v in self.vars_in_graph]) - - # if sample_var is None: - # sample_var = random.choice(projection) - # logger.info(sample_var) - - if sample_var: - projection.remove(sample_var) - - res = "SELECT %(samp)s %(proj)s WHERE {\n" \ - "%(valst)s\n" \ - "%(qpp)s}\n" \ - "%(lim)s" % { - 'samp': (' SAMPLE(%s) as %s' % ( - ''.join(sample_var.n3()), - ''.join(sample_var.n3()) - )) if sample_var else '', - 'proj': ' '.join([v.n3() for v in projection]), - 'valst': self._sparql_values_part(values=values_s_t, indent=' ') - if values_s_t is not None else '', - 'qpp': self._sparql_query_pattern_part( - values=values, - indent=' ', - ), - 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', - } - res = textwrap.dedent(res) - return self._sparql_prefix(res) - - def to_sparql_filter_by_count_in_out_query( - self, - values, - count_node, - in_out=None, - max_in=None, - max_out=None, - projection=None, - gp=None, - limit=None, - sample_var=None - ): - # TODO: Möglicherweise noch die Pfade aus dem gp_in rausfiltern, man - # will ja eher selten einen zusatzhop über einen schon vorhandenen - # Pfad finden - - """Generates a SPARQL select query from the graph pattern. - - Examples: - TODO - - Args: TODO - values: a dict mapping a variable tuple to a list of binding tuples, - e.g. {(v1, v2): [(uri1, uri2), (uri3, uri4), ...]} - count_node: Node to filter over outgoing arcs. - in_out: - max_in: - max_out: max outgoing arcs - projection: which variables to select on, by default all vars. - gp: - limit: integer to limit the result size - sample_var: the variable to sample over - """ - assert self.vars_in_graph, \ - "tried to get sparql for pattern without vars: %s" % (self,) - - if projection is None: - projection = sorted([v for v in self.vars_in_graph]) - if sample_var: - projection.remove(sample_var) - - if max_out is None: - max_out = 20 - if max_in is None: - max_in = 20 - - if in_out not in ['in', 'out', 'inout']: - in_out = random.choice(['in', 'out', 'inout']) - logger.info('in_out was set on %s' % in_out) - count_out = Variable('cout') - count_in = Variable('cin') - rand_var_out = gen_random_var() - rand_var_in = gen_random_var() - if gp: - if in_out == 'out': - gp_ = GraphPattern(chain(self, - GraphPattern([ - (count_node, count_out, rand_var_out) - ]), - gp)) - elif in_out == 'in': - gp_ = GraphPattern(chain(self, - GraphPattern([ - (rand_var_in, count_in, count_node) - ]), - gp)) - else: # TODO: Testen ob inout überhaupt passt - gp_ = GraphPattern(chain(self, - GraphPattern([ - (rand_var_in, count_in, count_node), - (count_node, count_out, rand_var_out) - ]), - gp)) - else: - if in_out == 'out': - gp_ = GraphPattern(chain(self, - GraphPattern([ - (count_node, count_out, rand_var_out) - ]) - )) - elif in_out == 'in': - gp_ = GraphPattern(chain(self, - GraphPattern([ - (rand_var_in, count_in, count_node) - ]) - )) - else: # TODO: Testen ob inout überhaupt passt - gp_ = GraphPattern(chain(self, - GraphPattern([ - (rand_var_in, count_in, count_node), - (count_node, count_out, rand_var_out) - ]) - )) - - res = "SELECT %(samp)s %(proj)s %(count)s WHERE " \ - "{\n%(qpp)s}\n%(gb)s\n%(hv)s\n%(lim)s" % { - 'samp': (' SAMPLE(%s) as %s' % ( - ''.join(sample_var.n3()), - ''.join(sample_var.n3()) - )) if sample_var else '', - 'proj': ' '.join([v.n3() for v in projection]), - 
'count': (' COUNT(%s) as %s' % ( - ''.join(count_out.n3()), - ''.join(count_out.n3()))) if in_out == 'out' else - (' COUNT(%s) as %s' % ( - ''.join(count_in.n3()), - ''.join(count_in.n3()))) if in_out == 'in' else - (' COUNT(%s) as %s COUNT(%s) as %s' % ( - ''.join(count_out.n3()), - ''.join(count_out.n3()), - ''.join(count_in.n3()), - ''.join(count_in.n3()) - )), - 'qpp': gp_._sparql_query_pattern_part( - values=values, - indent=' ', - ), - 'gb': ('GROUP BY ' + ' '.join([v.n3() for v in projection])), - 'hv': ('HAVING (COUNT(%s)<%s)' % ( - ''.join(count_out.n3()), - str(max_out))) if in_out == 'out' else - ('HAVING (COUNT(%s)<%s)' % ( - ''.join(count_in.n3()), - str(max_in))) if in_out == 'in' else - ('HAVING (COUNT(%s)<%s&&COUNT(%s)<%s)' % ( - ''.join(count_out.n3()), - str(max_out), - ''.join(count_in.n3()), - str(max_in) - )), - 'lim': ('LIMIT %d\n' % limit) if limit is not None else '', - } - res = textwrap.dedent(res) - return gp_._sparql_prefix(res) def to_sparql_deep_narrow_path_query( self, @@ -866,36 +678,7 @@ def to_sparql_deep_narrow_path_query( res = textwrap.dedent(res) return self._sparql_prefix(res) - def to_sparql_deep_narrow_path_inst_query_old( - self, - hop, - valueblocks, - gp_help, - gp_in=False - ): - res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \ - "%(val)s\n" \ - "%(trip)s }\n" \ - "GROUP BY %(vtf)s\n" \ - "HAVING (COUNT (?source) > 0)" % { - 'vtf': ' '.join([var.n3() for var in hop]), - 'val': ''.join([ - self._sparql_values_part( - values=valueblocks[key], indent=' ' - ) for key in valueblocks - ]), - 'trip': ''.join(gp_help._sparql_triples_part()) + - # TODO: nicht auf private Methode zugreifen - ''.join([ - self._sparql_triples_part( - indent=' ' - ) if gp_in else '' - ]), - } - res = textwrap.dedent(res) - return self._sparql_prefix(res) - # TODO: die normale inst durch diese hier ersetzen (sollte überall gehen) def to_sparql_deep_narrow_path_inst_query( self, hop, From 75bd1ea4af6ff6fa1b3b9eb983ac38b39bfec7ca Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 15:24:40 +0200 Subject: [PATCH 23/27] Comments -> english --- graph_pattern.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/graph_pattern.py b/graph_pattern.py index 5b31917..3cef58b 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -647,7 +647,7 @@ def to_sparql_deep_narrow_path_query( avglimit=10, gp_in=False ): - # TODO: evtl. 
Limit zufügen + # TODO: Maybe use a limit count_var_to_count = Variable('c' + ''.join(var_to_count)) avg_var_to_count = Variable('avgc' + ''.join(var_to_count)) res = "SELECT %(vtf)s (AVG(%(cvtc)s) as %(avtc)s) {\n" \ @@ -669,7 +669,7 @@ def to_sparql_deep_narrow_path_query( ]), 'trip': ''.join([ step._sparql_triples_part(indent=' ') for step in steps - # TODO: nicht auf private Methode zugreifen + # TODO: don't use private method ]) + ''.join([ self._sparql_triples_part(indent=' ') if gp_in else '' ]), @@ -686,6 +686,7 @@ def to_sparql_deep_narrow_path_inst_query( steps, gp_in=False ): + # TODO: Maybe use a limit res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \ "%(val)s\n" \ "%(trip)s }\n" \ @@ -699,7 +700,7 @@ def to_sparql_deep_narrow_path_inst_query( ]), 'trip': ''.join([ step._sparql_triples_part() for step in steps - # TODO: nicht auf private Methode zugreifen + # TODO: don't use private method ]) + ''.join([ self._sparql_triples_part(indent=' ') if gp_in else '' ]), From 72f2fee9e3078bd7b21bf30e6d5998cedd3cb956 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 15:58:45 +0200 Subject: [PATCH 24/27] deleted test_fv_eval.py and SPARQL-query.py --- tests/SPARQL-query.py | 75 ------------------ tests/test_fv_eval.py | 174 ------------------------------------------ 2 files changed, 249 deletions(-) delete mode 100644 tests/SPARQL-query.py delete mode 100644 tests/test_fv_eval.py diff --git a/tests/SPARQL-query.py b/tests/SPARQL-query.py deleted file mode 100644 index 4bbb7e0..0000000 --- a/tests/SPARQL-query.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding=utf-8 -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -"""ein File einfach um SPARQL-queries abzufeuern, statt es online im Browser -zu machen. -""" - -import logging -from collections import OrderedDict -from os import getenv - -import SPARQLWrapper -from splendid import time_func -import socket -import rdflib -from rdflib import URIRef -from rdflib import Variable - -from config import SPARQL_ENDPOINT -from gp_learner import evaluate -from gp_learner import mutate_fix_var -from gp_learner import update_individuals -from gp_query import calibrate_query_timeout -from gp_query import query_time_hard_exceeded -from gp_query import query_time_soft_exceeded -from graph_pattern import GraphPattern -from graph_pattern import SOURCE_VAR -from graph_pattern import TARGET_VAR -from ground_truth_tools import get_semantic_associations -from ground_truth_tools import split_training_test_set -from gtp_scores import GTPScores -from serialization import print_graph_pattern - - -sparql = SPARQLWrapper.SPARQLWrapper( - getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) -try: - timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup -except IOError: - from nose import SkipTest - raise SkipTest( - "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" - "Skipping tests in\n %s" % (SPARQL_ENDPOINT, __file__)) - -sparql.resetQuery() -sparql.setTimeout(timeout) -sparql.setReturnFormat(SPARQLWrapper.JSON) - -q = 'SELECT ?source ?target ?vcb0 ?vcb1 ?vcb2 ?vcb3 WHERE {' \ - '?source ?vcb0 ?vcb2 .' \ - '?target ?vcb3 .' \ - '?target ?vcb1 .' \ - '?vcb2 ?source .' \ - '?vcb2 ?target ' \ - '}' - -try: - q_short = ' '.join((line.strip() for line in q.split('\n'))) - sparql.setQuery(q_short) - c = time_func(sparql.queryAndConvert) -except socket.timeout: - c = (timeout, {}) -except ValueError: - # e.g. 
if the endpoint gives us bad JSON for some unicode chars - print( - 'Could not parse result for query, assuming empty result...\n' - 'Query:\n%s\nException:', q, - exc_info=1, # appends exception to message - ) - c = (timeout, {}) - -t, res = c -print('orig query took %.4f s, result:\n%s\n', t, res) \ No newline at end of file diff --git a/tests/test_fv_eval.py b/tests/test_fv_eval.py deleted file mode 100644 index 3c847b6..0000000 --- a/tests/test_fv_eval.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -"""test_mutate_fix_var und test_evaluate einmal davor und -einmal über die results aus mutate_fix_var -""" - -import logging -from collections import defaultdict -from collections import OrderedDict -from os import getenv - -import SPARQLWrapper -from splendid import get_path -from splendid import time_func -import socket -import rdflib -from rdflib import BNode -from rdflib import Literal -from rdflib import URIRef -from rdflib import Variable - -from config import SPARQL_ENDPOINT -from gp_learner import evaluate -from gp_learner import mutate_fix_var -from gp_learner import update_individuals -from gp_query import calibrate_query_timeout -from gp_query import query_time_hard_exceeded -from gp_query import query_time_soft_exceeded -from graph_pattern import GraphPattern -from graph_pattern import SOURCE_VAR -from graph_pattern import TARGET_VAR -from ground_truth_tools import get_semantic_associations -from ground_truth_tools import split_training_test_set -from gtp_scores import GTPScores -from serialization import print_graph_pattern -from utils import sparql_json_result_bindings_to_rdflib - -logger = logging.getLogger(__name__) - -dbp = rdflib.Namespace('http://dbpedia.org/resource/') -owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#') - -a = Variable('a') -b = Variable('b') -c = Variable('c') -d = Variable('d') -e = Variable('e') -f = Variable('f') -v = Variable('v') -w = Variable('w') - -sameAs = owl['sameAs'] - -gp_1 = GraphPattern([ - (SOURCE_VAR, v, TARGET_VAR) -]) - -gp_2 = GraphPattern([ - (SOURCE_VAR, v, TARGET_VAR), - (TARGET_VAR, w, SOURCE_VAR) -]) - -gp_3 = GraphPattern([ - (SOURCE_VAR, a, b), - (b, c, d), - (d, e, TARGET_VAR) -]) - -gp_4 = GraphPattern([ - (SOURCE_VAR, a, b), - (b, c, d), - (TARGET_VAR, e, d) -]) - -gp_5 = GraphPattern([ - (SOURCE_VAR, a, c), - (TARGET_VAR, URIRef('http://dbpedia.org/ontology/thumbnail'), d), - (TARGET_VAR, URIRef('http://dbpedia.org/property/image'), b), - (c, URIRef('http://dbpedia.org/ontology/wikiPageWikiLink'), SOURCE_VAR), - (c, URIRef('http://purl.org/linguistics/gold/hypernym'), TARGET_VAR) -]) - -ground_truth_pairs_1 = [ - (dbp['Berlin'], dbp['Germany']), - (dbp['Hamburg'], dbp['Germany']), - (dbp['Kaiserslautern'], dbp['Germany']), - (dbp['Wien'], dbp['Austria']), - (dbp['Insbruck'], dbp['Austria']), - (dbp['Salzburg'], dbp['Austria']), - (dbp['Paris'], dbp['France']), - (dbp['Lyon'], dbp['France']), - (dbp['Amsterdam'], dbp['Netherlands']), - (dbp['Brussels'], dbp['Belgium']), - (dbp['Washington'], dbp['United_States']), - (dbp['Madrid'], dbp['Spain']), - (dbp['Prague'], dbp['Czech_Republic']), - (dbp['Bern'], dbp['Switzerland']), -] - -ground_truth_pairs_2 = get_semantic_associations() -ground_truth_pairs_2, _ = split_training_test_set(ground_truth_pairs_2) -ground_truth_pairs_2 = ground_truth_pairs_2[1:100] - -ground_truth_pairs_3 = [ - (dbp['Barrister'], dbp['Law']), - (dbp['Christ'], dbp['Jesus']), - 
(dbp['Pottage'], dbp['Soup']) - ] - -ground_truth_pairs_4 = [ - (dbp['Motorrad_(disambiguation)'], dbp['Bmw_motorcycle']), - (dbp['Horse'], dbp['Saddle']) -] - -gtp_scores_1 = GTPScores(ground_truth_pairs_1) -gtp_scores_2 = GTPScores(ground_truth_pairs_2) -gtp_scores_3 = GTPScores(ground_truth_pairs_3) -gtp_scores_4 = GTPScores(ground_truth_pairs_4) - -sparql = SPARQLWrapper.SPARQLWrapper( - getenv('SPARQL_ENDPOINT', 'http://dbpedia.org/sparql')) -#sparql = SPARQLWrapper.SPARQLWrapper(SPARQL_ENDPOINT) -try: - timeout = max(5, calibrate_query_timeout(sparql)) # 5s for warmup -except IOError: - from nose import SkipTest - raise SkipTest( - "Can't establish connection to SPARQL_ENDPOINT:\n %s\n" - "Skipping tests in\n %s" % (SPARQL_ENDPOINT, __file__)) - - -def test_eval(gtp_scores, gp): - res, matching_node_pairs, gtp_precisions = evaluate( - sparql, timeout, gtp_scores, gp, run=0, gen=0) - update_individuals([gp], [(res, matching_node_pairs, gtp_precisions)]) - logger.info(gp.fitness) - - -def test_mut_fv(gtp_scores, gp, r=None): - res = mutate_fix_var(sparql, timeout, gtp_scores, gp, rand_var=r) - for gp_ in res: - logger.info(gp_) - - -def test_eval_list(gtp_scores, gp, r=None): - mfv_res = mutate_fix_var(sparql, timeout, gtp_scores, gp, rand_var=r) - for gp_ in mfv_res: - res, matching_node_pairs, gtp_precisions = evaluate( - sparql, timeout, gtp_scores, gp_, run=0, gen=0) - update_individuals([gp_], [(res, matching_node_pairs, gtp_precisions)]) - print_graph_pattern(gp_, print_matching_node_pairs=0) - return mfv_res - - -def test_eval_list_double(gtp_scores, gp, r_1=None, r_2=None): - # testing double execution of mutate_fix_var() on gp - res = test_eval_list(gtp_scores, gp, r_1) - gtp_scores.update_with_gps(res) - res_list = list(res) - for gp in res: - res_ = test_eval_list(gtp_scores, gp, r_2) - for gp_ in res_: - res_list.append(gp_) - gtp_scores.update_with_gps(res_list) - for gp in res_list: - print_graph_pattern(gp, print_matching_node_pairs=0) - -if __name__ == '__main__': - test_steps(ground_truth_pairs_2) - #values = {(SOURCE_VAR, TARGET_VAR): ground_truth_pairs_1} - #print(gp_1.to_sparql_select_sample_query(values)) From e2e09a4203dc88ab0668037f255c37ace6d7cd81 Mon Sep 17 00:00:00 2001 From: "philipp.neuer" Date: Wed, 5 Sep 2018 16:03:41 +0200 Subject: [PATCH 25/27] Erased everything except the test for the mutation in the learner --- tests/test_mutate_deep_narrow.py | 3235 ------------------------------ 1 file changed, 3235 deletions(-) diff --git a/tests/test_mutate_deep_narrow.py b/tests/test_mutate_deep_narrow.py index d564f6f..b636735 100644 --- a/tests/test_mutate_deep_narrow.py +++ b/tests/test_mutate_deep_narrow.py @@ -183,2919 +183,6 @@ (v[2], dbo['wikiPageRedirects'], SOURCE_VAR) ]) -# Verschiedene Limits festlegen: -# Limit: search object-list => subject-values in next query -limit_next = 500 -# limt: search an object list from two diferrent subjects and get hits through -# comparing them -limit_endpoint_two_sided = 1000 -# limit: search object-list => compare with sources/targets from gtp -limit_choose_endpoint = 5000 -# limit: search subject-list from two diferrent objects and get hits through -# comparing them -limit_startpoint_two_sided = 200 -# limit: search subject-list => subject-values in next query -limit_subject_next = 350 -# limit: search subject list => compare with sources/targets from gtp -limit_choose_subject_endpoint = 3000 -# limits: hit-list => on side subject, one side object: -limit_subj_to_obj = 350 -limit_obj_to_subj = 1500 - - -# einen 
ein-hop-weg von source zu target zum pattern hinzufügen -# TODO Varianten (von gefundenen b aus Variante der zweiten query -# 1.(default) mit (b, c, d) Liste von d suchen und mit Target-Liste vergleichen -# 2. mit (b, c, target). VALUES(target) suchen => -# Ergebnisse direkt an existente Targets gebunden -# 3. mit (b, c, target).urspurngs_gp -def mutate_deep_narrow_one_hop_s_t_without_direction( - gp_, gtps, max_out=None, max_in=None, in_out=None -): - vars_ = gp_.vars_in_graph - if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): - logger.info('SOURCE or TARGET are not in gp: %s' % gp_) - return [] - # Erstelle pattern für den ersten Schritt - a = Variable('a') - b = Variable('b') - c = Variable('c') - values_s_t = {(SOURCE_VAR, TARGET_VAR): gtps} - gp1 = GraphPattern([(SOURCE_VAR, a, b)]) - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, gp=gp_, limit=200) - logger.info(q) - t, q_res1 = run_query(q) - if not q_res1['results']['bindings']: - return [] - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) - # Erstelle values aus den Ergebnissen für b - values = get_values([b], q_res1) - gp2 = GraphPattern([(b, c, TARGET_VAR)]) - # Query die über eine var aus gp2 random samplet mit values aus b_list - q = gp2.to_sparql_select_sample_query(values=values, limit=5000) - logger.info(q) - try: - t, q_res2 = run_query(q) - except: - logger.info('Die Query (s.o.) hat nicht geklappt') - return [] - # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind - target_list = get_values_list(TARGET_VAR, q_res2) - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) - # Kreiere gtps_hit in der alle gtps, deren targets in target_list enthalten - # sind, "gespeichert" werden - stp_hit = get_stp_hit(target_list, gtps, 1) - gp_list = get_fixed_path_gp_one_hop( - q_res1, q_res2, gp_, stp_hit, [], a, b, c - ) - return gp_list - - -# einen ein-hop-weg von source zu target zum pattern hinzufügen -# (gp in query 2 eingefügt) -def mutate_deep_narrow_one_hop_s_t_2(gp_, gtps, max_in_out=None, in_out=None): - vars_ = gp_.vars_in_graph - if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): - logger.info('SOURCE or TARGET are not in gp: %s' % gp_) - return [] - # Erstelle pattern für den ersten Schritt - a = Variable('a') - b = Variable('b') - c = Variable('c') - gp1 = GraphPattern([(SOURCE_VAR, a, b)]) - values_s_t = {(SOURCE_VAR, TARGET_VAR): gtps} - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s_t, count_node=b, in_out=in_out, - max_out=max_in_out, gp=gp_, limit=200) - logger.info(q) - t, q_res1 = run_query(q) - if not q_res1['results']['bindings']: - return [] - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) - gp2 = GraphPattern([(b, c, TARGET_VAR)]) - # Erstelle values aus den Ergebnissen für b - values = get_values([b], q_res1) - # Query die über eine var aus gp2 random samplet mit values aus b_list - q = gp2.to_sparql_select_sample_query( - values=values, values_s_t=values_s_t, limit=5000 - ) - logger.info(q) - try: - t, q_res2 = run_query(q) - except: - logger.info('Die Query (s.o.) 
hat nicht geklappt') - return [] - # Kreiere target_list, in der die "gefundenen" Targets vermerkt sind - target_list = get_values_list(TARGET_VAR, q_res2) - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) - # Kreiere gtps_hit in der alle gtps, deren targets in target_list enthalten - # sind, "gespeichert" werden - stp_hit = get_stp_hit(target_list, gtps, 1) - gp_list = get_fixed_path_gp_one_hop(q_res1, q_res2, gp_, stp_hit, a, b, c) - return gp_list - - -# eine one-hop verbindung zwischen source und target finden (Richtungen random) -def mutate_deep_narrow_one_random_hop_s_t(): - ich_darf_nich_leer_sein = [] - return ich_darf_nich_leer_sein - - -# einen direkten weg um einen hop erweitern (Weg löschen und stattdessen -# ein-hop weg einfügen) - - -# zu einem direkten weg noch einen ein-hop weg hinzufügen (weg behalten, -# ein-hop weg dazu) - - -# Runs a given (as String) query against the Sparql-endpoint -def run_query(q): - try: - q_short = ' '.join((line.strip() for line in q.split('\n'))) - sparql.setQuery(q_short) - cal = time_func(sparql.queryAndConvert) - except socket.timeout: - cal = (timeout, {}) - except ValueError: - # e.g. if the endpoint gives us bad JSON for some unicode chars - logger.info( - 'Could not parse result for query, assuming empty result...\n' - 'Query:\n%s\nException:', q, - exc_info=1, # appends exception to message - ) - cal = (timeout, {}) - return cal - - -# returns a list of value-tupels for the given variables, out of an -# query-result -def get_values(varlist, q_res): - res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(q_res, res_rows_path, default=[]) - ) - vallist = [] - for row in bind: - tup = () - for var in varlist: - tup = tup + (get_path(row, [var]), ) - vallist.append(tup) - # ausfiltern von vallist (leider notwendig vor allem wegen dbr:Template - vallist[:] = [valtup for valtup in vallist if not list_remove_bool(valtup)] - # dopppelte noch herausfiltern - vallist = list(set(vallist)) - vartup = () - for var in varlist: - vartup = vartup + (var, ) - values = {vartup: vallist} - return values - - -# returns a list of found values for a given variable and query-result -def get_values_list(var, q_res): - res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(q_res, res_rows_path, default=[]) - ) - vallist = [get_path(row, [var]) for row in bind] - return vallist - - -# gibt ein sample nach der Gewichtung der counts zurück, -# Gewichtung ist hier innerhalb angesetzt -def get_weighted_sample(var, count, q_res): - res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(q_res, res_rows_path, default=[]) - ) - val = [] - weight = [] - for row in bind: - val.append(get_path(row, [var])) - # Davon ausgehend, dass x besonders gut ist - if float(get_path(row, [count])) == 1.0: - weight.append(10000) - else: - weight.append(1/(abs(1-float(get_path(row, [count]))))) - # Davon ausgehend, dass x besonders schlecht ist - # weight.append(abs(7-float(get_path(row, [count])))) - # weight.append(get_path(row, [count])) - s = sum(weight) - for i in range(len(weight)): - weight[i] = weight[i] / s - cum_weights = [0] + list(np.cumsum(weight)) - res = [] - while len(res) < min(10, len(list(set(val)))): - x = np.random.random() - i = 0 - while x > cum_weights[i]: - i = i + 1 - index = i - 1 - if val[index] not in res: - res.append((val[index],)) - sample = {(var,): res} - return sample - - -# gibt zu einer gegebenen Liste 
von Variablen die stp aus gtps zurück, -# bei denen Target(st=1)/Source(st=0) in der Variablen Liste ist. -def get_stp_hit(varlist, gtps, st): - stp = [] - for t in varlist: - for gtp in gtps: - if t == gtp[st]: - stp.append(gtp) - return stp - - -# Checks if an found RDF-Term can be used as value in a new query -# (without conflicts) -def list_remove_bool(tup): - for var in tup: - if isinstance(var, Literal): - i_n3 = var.n3() - if len(i_n3) > 60: - return True - elif isinstance(var, BNode): - return True - elif isinstance(var, URIRef): - return '%' in var - # TODO: nochmal schauen das % rauswerfen war kuzfristig, - # weil sparql mir bei einer query nen Fehler geschmissen hat - return False - - -# evaluates a given graph-pattern-list -def eval_gp_list(gtp_scores, gp_list): - for gp_l in gp_list: - eval_gp(gtp_scores, gp_l) - return gp_list - - -# evaluate a given graph-pattern -def eval_gp(gtp_scores, gp): - res = evaluate( - sparql, timeout, gtp_scores, gp, run=0, gen=0) - update_individuals([gp], [res]) - - -# helper to get target-hits and the corresponding stp -def target_hit(stps, t_lis): - res = [] - for stp in stps: - for t in t_lis: - if t == stp[1]: - res.append( - (t, stp) - ) - return res - - -# add one hop with the given direction. -def mutate_deep_narrow_one_hop( - gp_, max_out=None, max_in=None, in_out=None, richtung=None -): - vars_ = gp_.vars_in_graph - if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): - logger.info('SOURCE or TARGET are not in gp: %s' % gp_) - return [] - if not gp_.matching_node_pairs: - logger.info( - 'No matching node pairs, cant get better through adding constraints' - ) - return [] - # Erstelle pattern für den ersten Schritt - a = Variable('a') - b = Variable('b') - c = Variable('c') - if richtung not in [1, 2, 3, 4]: - richtung = random.choice([1, 2, 3, 4]) - logger.info('Richtung %s wurde gewaehlt' % richtung) - if richtung == 1: - values_s_t = {(SOURCE_VAR, TARGET_VAR): gp_.matching_node_pairs} - gp1 = GraphPattern([(SOURCE_VAR, a, b)]) - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=200) - logger.info(q) - t, q_res1 = run_query(q) - if not q_res1: - return [] - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) - # Erstelle values aus den Ergebnissen für b - values = get_values([b], q_res1) - gp2 = GraphPattern([(b, c, TARGET_VAR)]) - # Query die über eine var aus gp2 random samplet mit values aus b_list - q = gp2.to_sparql_select_sample_query(values=values, limit=5000) - logger.info(q) - try: - t, q_res2 = run_query(q) - except: - logger.info('Die Query (s.o.) 
hat nicht geklappt') - return [] - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res2) - gp_list = get_fixed_path_gp_one_hop( - q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c - ) - elif richtung == 2: - values_s = { - (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] - } - values_t = { - (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] - } - gp1 = GraphPattern([(SOURCE_VAR, a, b)]) - gp2 = GraphPattern([(TARGET_VAR, c, b)]) - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=1000) - logger.info(q) - t, q_res1 = run_query(q) - if not q_res1['results']['bindings']: - return [] - q = gp2.to_sparql_filter_by_count_in_out_query( - values=values_t, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=1000) - logger.info(q) - t, q_res2 = run_query(q) - if not q_res2['results']['bindings']: - return [] - gp_list = get_fixed_path_gp_one_hop( - q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c - ) - elif richtung == 3: - values_s_t = {(SOURCE_VAR, TARGET_VAR): gp_.matching_node_pairs} - gp2 = GraphPattern([(TARGET_VAR, c, b)]) - q = gp2.to_sparql_filter_by_count_in_out_query( - values=values_s_t, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=200) - logger.info(q) - t, q_res2 = run_query(q) - if not q_res2['results']['bindings']: - return [] - # logger.info('orig query took %.4f s, result:\n%s\n', t, q_res1) - gp1 = GraphPattern([(b, a, SOURCE_VAR)]) - # Erstelle values aus den Ergebnissen für b - values = get_values([b], q_res2) - # Query die über eine var aus gp2 random samplet mit values aus b_list - q = gp1.to_sparql_select_sample_query(values=values, limit=5000) - logger.info(q) - try: - t, q_res1 = run_query(q) - except: - logger.info('Die Query (s.o.) hat nicht geklappt') - return [] - gp_list = get_fixed_path_gp_one_hop( - q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c - ) - else: - values_s = { - (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] - } - values_t = { - (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs] - } - gp1 = GraphPattern([(b, a, SOURCE_VAR)]) - gp2 = GraphPattern([(b, c, TARGET_VAR)]) - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=200) - logger.info(q) - t, q_res1 = run_query(q) - if not q_res1['results']['bindings']: - return [] - q = gp2.to_sparql_filter_by_count_in_out_query( - values=values_t, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=200) - logger.info(q) - t, q_res2 = run_query(q) - if not q_res2['results']['bindings']: - return [] - gp_list = get_fixed_path_gp_one_hop( - q_res1, q_res2, gp_, richtung, gp_.matching_node_pairs, a, b, c - ) - return gp_list - - -# fixed den ein-hop-pfad zwischen Source und Target, fügt ihn dem Pattern hinzu -# und gibt die Liste der resultierenden Pattern zurück -# TODO nicht so sehr auf source a b. b c Target fokussieren. 
-def get_fixed_path_gp_one_hop(q_res1, q_res2, gp_main, richtung, stp, a, b, c): - gp_list = [] - res_rows_path = ['results', 'bindings'] - bind1 = sparql_json_result_bindings_to_rdflib( - get_path(q_res1, res_rows_path, default=[]) - ) - bind2 = sparql_json_result_bindings_to_rdflib( - get_path(q_res2, res_rows_path, default=[]) - ) - for row2 in bind2: - for gtp in stp: - if gtp[1] == get_path(row2, [TARGET_VAR]): - for row1 in bind1: - if (get_path(row1, [b]) == get_path(row2, [b])) and \ - (get_path(row1, [SOURCE_VAR]) == gtp[0]): - if richtung == 1: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (b, get_path(row2, [c]), TARGET_VAR) - ]) - elif richtung == 2: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (TARGET_VAR, get_path(row2, [c]), b) - ]) - elif richtung == 3: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (TARGET_VAR, get_path(row2, [c]), b) - ]) - else: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (b, get_path(row2, [c]), TARGET_VAR) - ]) - - gp_ = GraphPattern(chain(gp_, gp_main)) - if gp_ not in gp_list: - gp_list.append(gp_) - logger.info(gtp) - return gp_list - - -# fixed den ein-hop-pfad zwischen Source und Target, fügt ihn dem Pattern hinzu -# und gibt die Liste der resultierenden Pattern zurück -# TODO nicht so sehr auf source a b. b c Target fokussieren. -def get_fixed_path_gp_two_hops( - q_res1, q_res2, q_res3, gp_main, richtung, stp, a, b, c, d, e -): - # TODO: überlegen nicht nur verschieden Pattern für verschiedene Richtungen - # zu machen, sondern auch in den Unterschiedlichen Ergebnissn anfangen - # (Idee wäre z.B. die a bis e durch nummerierte random vars zu ersetzen und - # sich dann zu überlegen wie man das übergibt, ob mans iwie immer entlang - # des patterns schafft oder eher nicht. 
- gp_list = [] - res_rows_path = ['results', 'bindings'] - bind1 = sparql_json_result_bindings_to_rdflib( - get_path(q_res1, res_rows_path, default=[]) - ) - bind2 = sparql_json_result_bindings_to_rdflib( - get_path(q_res2, res_rows_path, default=[]) - ) - bind3 = sparql_json_result_bindings_to_rdflib( - get_path(q_res3, res_rows_path, default=[]) - ) - for gtp in stp: - for row3 in bind3: - if gtp[1] == get_path(row3, [TARGET_VAR]): - for row2 in bind2: - if get_path(row2, [d]) == get_path(row3, [d]): - for row1 in bind1: - if get_path(row1, [b]) == \ - get_path(row2, [b]) and \ - get_path(row1, [SOURCE_VAR]) == \ - gtp[0]: - if richtung == 1: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (b, get_path(row2, [c]), d), - (d, get_path(row3, [e]), TARGET_VAR) - ]) - elif richtung == 2: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (b, get_path(row2, [c]), d), - (TARGET_VAR, get_path(row3, [e]), d) - ]) - elif richtung == 3: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (d, get_path(row2, [c]), b), - (d, get_path(row3, [e]), TARGET_VAR) - ]) - elif richtung == 4: - gp_ = GraphPattern([ - (SOURCE_VAR, get_path(row1, [a]), b), - (d, get_path(row2, [c]), b), - (TARGET_VAR, get_path(row3, [e]), d) - ]) - elif richtung == 5: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (b, get_path(row2, [c]), d), - (d, get_path(row3, [e]), TARGET_VAR) - ]) - elif richtung == 6: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (b, get_path(row2, [c]), d), - (TARGET_VAR, get_path(row3, [e]), d) - ]) - elif richtung == 7: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (d, get_path(row2, [c]), b), - (d, get_path(row3, [e]), TARGET_VAR) - ]) - else: - gp_ = GraphPattern([ - (b, get_path(row1, [a]), SOURCE_VAR), - (d, get_path(row2, [c]), b), - (TARGET_VAR, get_path(row3, [e]), d) - ]) - gp_ = GraphPattern(chain(gp_, gp_main)) - if gp_ not in gp_list: - gp_list.append(gp_) - logger.debug(gtp) - return gp_list - - -# add two hops. -def mutate_deep_narrow_two_hops( - gp_, max_out=None, max_in=None, in_out=None, richtung=None -): - vars_ = gp_.vars_in_graph - if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_): - logger.debug('SOURCE or TARGET are not in gp: %s' % gp_) - return [] - if not gp_.matching_node_pairs: - logger.debug( - 'No matching node pairs, cant get better through adding constraints' - ) - return [] - a = Variable('a') - b = Variable('b') - c = Variable('c') - d = Variable('d') - e = Variable('e') - gp_list = [] - if richtung not in range(1, 9): - richtung = random.choice(range(1, 9)) - logger.debug('Richtung %s wurde gewaehlt' % richtung) - if richtung == 1: - gp1 = GraphPattern([(SOURCE_VAR, a, b)]) - gp2 = GraphPattern([(b, c, d)]) - gp3 = GraphPattern([(d, e, TARGET_VAR)]) - - values_s = { - (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs] - } - q = gp1.to_sparql_filter_by_count_in_out_query( - values=values_s, count_node=b, in_out=in_out, max_out=max_out, - max_in=max_in, limit=limit_next) - logger.debug(q) - try: - t, q_res1 = run_query(q) - except: - logger.debug('Die Query (s.o.) hat nicht geklappt') - return [] - if not q_res1: - logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') - return [] - elif not q_res1['results']['bindings']: - logger.debug('Die Query (s.o.) 
-
-
-# add two hops to the pattern
-def mutate_deep_narrow_two_hops(
-    gp_, max_out=None, max_in=None, in_out=None, richtung=None
-):
-    vars_ = gp_.vars_in_graph
-    if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_):
-        logger.debug('SOURCE or TARGET are not in gp: %s' % gp_)
-        return []
-    if not gp_.matching_node_pairs:
-        logger.debug(
-            "no matching node pairs, can't get better by adding constraints"
-        )
-        return []
-    a = Variable('a')
-    b = Variable('b')
-    c = Variable('c')
-    d = Variable('d')
-    e = Variable('e')
-    if richtung not in range(1, 9):
-        richtung = random.choice(range(1, 9))
-    logger.debug('direction %s was chosen' % richtung)
-    gps = [
-        GraphPattern([tr])
-        for tr in _two_hop_shapes(a, b, c, d, e)[richtung]
-    ]
-    values_s = {
-        (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs]
-    }
-    values_t = {
-        (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs]
-    }
-    # per-direction query plans: order of the three sub-queries with their
-    # (sub-pattern index, kind, VALUES seed, extracted var, limit); the
-    # limits are the ones the eight original copy-pasted blocks used
-    ln, lsn = limit_next, limit_subject_next
-    le2, ls2 = limit_endpoint_two_sided, limit_startpoint_two_sided
-    lce = limit_choose_endpoint
-    plans = {
-        1: [(0, 'count', 'S', b, ln), (1, 'count', b, d, ln),
-            (2, 'sample', d, None, lce)],
-        2: [(0, 'count', 'S', b, ln), (1, 'count', b, d, le2),
-            (2, 'count', 'T', d, le2)],
-        3: [(0, 'count', 'S', b, ln), (1, 'count', b, d, ls2),
-            (2, 'count', 'T', d, ls2)],
-        4: [(0, 'count', 'S', b, le2), (2, 'count', 'T', d, ln),
-            (1, 'count', d, b, le2)],
-        5: [(0, 'count', 'S', b, lsn), (1, 'count', b, d, ln),
-            (2, 'sample', d, None, lce)],
-        6: [(2, 'count', 'T', d, ln), (0, 'count', 'S', b, ls2),
-            (1, 'count', d, b, ls2)],
-        7: [(2, 'count', 'T', d, lsn), (1, 'count', d, b, ln),
-            (0, 'sample', b, None, lce)],
-        8: [(2, 'count', 'T', d, ln), (1, 'count', d, b, ln),
-            (0, 'sample', b, None, lce)],
-    }
-    q_res = _run_hop_plan(
-        plans[richtung], gps, values_s, values_t, in_out, max_out, max_in
-    )
-    if q_res is None:
-        return []
-    return get_fixed_path_gp_two_hops(
-        q_res[0], q_res[1], q_res[2], gp_, richtung,
-        gp_.matching_node_pairs, a, b, c, d, e
-    )
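-
-
-# Editorial usage sketch (not from the original commit), mirroring the
-# parameters of the commented-out experiment at the bottom of this file:
-#
-#     gp = gp_found['140']
-#     eval_gp(gtp_scores, gp)
-#     candidates = mutate_deep_narrow_two_hops(
-#         gp, max_out=65, max_in=40, in_out='out', richtung=2)
-#     for cand in candidates:
-#         print_graph_pattern(cand)
-#
-# print_graph_pattern is only an assumption here (it is used elsewhere in
-# this repo for pretty-printing); any logging of the candidates would do.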
-
-
-# fixes the three-hop path between source and target, adds it to the
-# pattern and returns the list of resulting patterns
-# TODO: don't focus so much on source a b . b c target
-def get_fixed_path_gp_three_hops(
-    q_res1, q_res2, q_res3, q_res4, gp_main, richtung, stp,
-    a, b, c, d, e, f, g
-):
-    # TODO: consider not only different patterns for the different
-    # directions, but also starting from the different results (one idea:
-    # replace a..e by numbered random vars and then think about how to
-    # pass them on, and whether that always works out along the pattern)
-    gp_list = []
-    res_rows_path = ['results', 'bindings']
-    bind1 = sparql_json_result_bindings_to_rdflib(
-        get_path(q_res1, res_rows_path, default=[])
-    )
-    bind2 = sparql_json_result_bindings_to_rdflib(
-        get_path(q_res2, res_rows_path, default=[])
-    )
-    bind3 = sparql_json_result_bindings_to_rdflib(
-        get_path(q_res3, res_rows_path, default=[])
-    )
-    bind4 = sparql_json_result_bindings_to_rdflib(
-        get_path(q_res4, res_rows_path, default=[])
-    )
-    for gtp in stp:
-        for row4 in bind4:
-            if gtp[1] != get_path(row4, [TARGET_VAR]):
-                continue
-            for row3 in bind3:
-                if get_path(row3, [f]) != get_path(row4, [f]):
-                    continue
-                for row2 in bind2:
-                    if get_path(row2, [d]) != get_path(row3, [d]):
-                        continue
-                    for row1 in bind1:
-                        if get_path(row1, [b]) != get_path(row2, [b]) \
-                                or get_path(row1, [SOURCE_VAR]) != gtp[0]:
-                            continue
-                        if richtung == 1:
-                            gp_ = GraphPattern([
-                                (SOURCE_VAR, get_path(row1, [a]), b),
-                                (b, get_path(row2, [c]), d),
-                                (d, get_path(row3, [e]), f),
-                                (f, get_path(row4, [g]), TARGET_VAR)
-                            ])
-                        elif richtung == 2:
-                            gp_ = GraphPattern([
-                                (SOURCE_VAR, get_path(row1, [a]), b),
-                                (b, get_path(row2, [c]), d),
-                                (d, get_path(row3, [e]), f),
-                                (TARGET_VAR, get_path(row4, [g]), f)
-                            ])
-                        else:
-                            # dummy else so that gp_ is always assigned;
-                            # directions 3/4 are not reconstructed here yet
-                            gp_ = GraphPattern([])
-                        gp_ = GraphPattern(chain(gp_, gp_main))
-                        if gp_ not in gp_list:
-                            gp_list.append(gp_)
-                        logger.debug(gtp)
-    return gp_list
-
-
-# add three hops to the pattern
-def mutate_deep_narrow_three_hops(
-    gp_, max_out=None, max_in=None, in_out=None, richtung=None
-):
-    vars_ = gp_.vars_in_graph
-    if not (SOURCE_VAR in vars_ and TARGET_VAR in vars_):
-        logger.debug('SOURCE or TARGET are not in gp: %s' % gp_)
-        return []
-    if not gp_.matching_node_pairs:
-        logger.debug(
-            "no matching node pairs, can't get better by adding constraints"
-        )
-        return []
-    a = Variable('a')
-    b = Variable('b')
-    c = Variable('c')
-    d = Variable('d')
-    e = Variable('e')
-    f = Variable('f')
-    g = Variable('g')
-    # only directions 1-4 are implemented below (the original drew from
-    # range(1, 17), which left gp_list unbound for directions > 4)
-    if richtung not in range(1, 5):
-        richtung = random.choice(range(1, 5))
-    logger.debug('direction %s was chosen' % richtung)
-    shapes = {
-        1: [(SOURCE_VAR, a, b), (b, c, d), (d, e, f), (f, g, TARGET_VAR)],
-        2: [(SOURCE_VAR, a, b), (b, c, d), (d, e, f), (TARGET_VAR, g, f)],
-        3: [(SOURCE_VAR, a, b), (b, c, d), (f, e, d), (f, g, TARGET_VAR)],
-        4: [(SOURCE_VAR, a, b), (b, c, d), (f, e, d), (TARGET_VAR, g, f)],
-    }
-    gps = [GraphPattern([tr]) for tr in shapes[richtung]]
-    values_s = {
-        (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs]
-    }
-    values_t = {
-        (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs]
-    }
-    ln = limit_next
-    le2, ls2 = limit_endpoint_two_sided, limit_startpoint_two_sided
-    lce = limit_choose_endpoint
-    plans = {
-        1: [(0, 'count', 'S', b, ln), (1, 'count', b, d, ln),
-            (2, 'count', d, f, ln), (3, 'sample', f, None, lce)],
-        2: [(0, 'count', 'S', b, ln), (1, 'count', b, d, ln),
-            (2, 'count', d, f, le2), (3, 'count', 'T', f, le2)],
-        3: [(0, 'count', 'S', b, ln), (1, 'count', b, d, ln),
-            (2, 'count', d, f, ls2), (3, 'count', 'T', f, ls2)],
-        4: [(0, 'count', 'S', b, ln), (1, 'count', b, d, le2),
-            (3, 'count', 'T', f, ln), (2, 'count', f, d, le2)],
-    }
-    q_res = _run_hop_plan(
-        plans[richtung], gps, values_s, values_t, in_out, max_out, max_in
-    )
-    if q_res is None:
-        return []
-    return get_fixed_path_gp_three_hops(
-        q_res[0], q_res[1], q_res[2], q_res[3], gp_, richtung,
-        gp_.matching_node_pairs, a, b, c, d, e, f, g
-    )
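-
-
-# Editorial note: a plan row such as (3, 'count', 'T', f, limit_next)
-# reads: run sub-pattern gps[3] as a filter-by-count query, seed it with
-# the target VALUES block, extract the bindings of ?f, and cap the result
-# at limit_next rows; 'sample' rows use to_sparql_select_sample_query
-# instead. The row order reproduces the query order of the original
-# per-direction code blocks.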
-
-
-def get_fixed_path_gp_n_hops(
-    res_q, gp_, n, direct, stp, node, hn_ind, hop
-):
-    gp_list = []
-    res_rows_path = ['results', 'bindings']
-    bind = []
-    for res_q_i in res_q:
-        bind.append(sparql_json_result_bindings_to_rdflib(
-            get_path(res_q_i, res_rows_path, default=[])
-        ))
-    hit_paths = []
-    hit_paths_help = []
-
-    if hn_ind == 0:
-        for row in bind[0]:
-            for mnp in stp:
-                if mnp[0] == get_path(row, [node[0]]):
-                    hit_paths.append([[
-                        mnp[0],
-                        get_path(row, [hop[0]]),
-                        get_path(row, [node[1]])
-                    ]])
-        for i in range(1, n+1):
-            for path in hit_paths:
-                for row in bind[i]:
-                    if path[i-1][2] == get_path(row, [node[i]]):
-                        path_h = path + [[
-                            path[i-1][2],
-                            get_path(row, [hop[i]]),
-                            get_path(row, [node[i+1]])
-                        ]]
-                        hit_paths_help.append(path_h)
-            hit_paths = hit_paths_help
-            hit_paths_help = []
-
-    elif hn_ind == n+1:
-        for row in bind[n]:
-            for mnp in stp:
-                if mnp[1] == get_path(row, [node[n+1]]):
-                    hit_paths.append([[
-                        get_path(row, [node[n]]),
-                        get_path(row, [hop[n]]),
-                        mnp[1]
-                    ]])
-        for i in range(n-1, -1, -1):
-            for path in hit_paths:
-                for row in bind[i]:
-                    if path[(n-1)-i][0] == get_path(row, [node[i+1]]):
-                        # fixed: the original used path.append(...), which
-                        # returns None, with misplaced parentheses
-                        path_h = path + [[
-                            get_path(row, [node[i]]),
-                            get_path(row, [hop[i]]),
-                            path[(n-1)-i][0]
-                        ]]
-                        hit_paths_help.append(path_h)
-            hit_paths = hit_paths_help
-            hit_paths_help = []
-        for path in hit_paths:
-            path.reverse()
-
-    else:
-        hit_paths_l = []
-        hit_paths_r = []
-        # get the hits of hit_node to start from
-        for row_l in bind[hn_ind-1]:
-            for row_r in bind[hn_ind]:
-                if get_path(row_l, [node[hn_ind]]) == \
-                        get_path(row_r, [node[hn_ind]]):
-                    hit_paths_l.append([[
-                        get_path(row_l, [node[hn_ind-1]]),
-                        get_path(row_l, [hop[hn_ind-1]]),
-                        get_path(row_l, [node[hn_ind]])
-                    ]])
-                    hit_paths_r.append([[
-                        get_path(row_r, [node[hn_ind]]),
-                        get_path(row_r, [hop[hn_ind]]),
-                        get_path(row_r, [node[hn_ind+1]])
-                    ]])
-        # get the path from hit node to targets
-        for i in range(hn_ind+1, n+1):
-            for path in hit_paths_r:
-                for row in bind[i]:
-                    if path[i-(hn_ind+1)][2] == get_path(row, [node[i]]):
-                        path_h = path + [[
-                            path[i-(hn_ind+1)][2],
-                            get_path(row, [hop[i]]),
-                            get_path(row, [node[i+1]])
-                        ]]
-                        hit_paths_help.append(path_h)
-            hit_paths_r = hit_paths_help
-            hit_paths_help = []
-        # get the path from hit node to sources
-        for i in range(hn_ind, -1, -1):
-            for path in hit_paths_l:
-                for row in bind[i]:
-                    if path[hn_ind-i][0] == get_path(row, [node[i+1]]):
-                        path_h = path + [[
-                            get_path(row, [node[i]]),
-                            get_path(row, [hop[i]]),
-                            path[hn_ind-i][0]
-                        ]]
-                        hit_paths_help.append(path_h)
-            hit_paths_l = hit_paths_help
-            hit_paths_help = []
-        # get the full path from source to target
-        for path_l in hit_paths_l:
-            path_l.reverse()
-            for path_r in hit_paths_r:
-                if path_l[hn_ind][2] == path_r[0][0]:
-                    hit_paths.append(path_l + path_r)
-
-    # filter the paths over stp hits
-    hit_paths = filter_stp_hits(hit_paths, stp)
-
-    # make graph patterns with fixed hops out of the found paths
-    for path in hit_paths:
-        gp_list.append(
-            GraphPattern(
-                chain(
-                    GraphPattern([
-                        # fixed: the original called direct(i), but direct
-                        # is a list, not a callable
-                        (node[i], path[i][1], node[i+1]) if direct[i] == 1
-                        else (node[i+1], path[i][1], node[i])
-                        for i in range(n+1)
-                    ]),
-                    gp_
-                )
-            )
-        )
-
-    return gp_list
-
-
-def filter_stp_hits(hit_paths, stp):
-    # keep only paths whose endpoints form an actual source/target pair
-    res = []
-    for hit in hit_paths:
-        for mnp in stp:
-            if mnp[0] == hit[0][0] and mnp[1] == hit[-1][2]:
-                res.append(hit)
-    return res
-
-
-def mutate_deep_narrow_n_hops(
-    gp_, n, max_out=None, max_in=None, in_out=None, direct=None
-):
-    vars_ = gp_.vars_in_graph
-    # fixed: the original tested "not in ... and ... not in", which let
-    # patterns containing only one of the two variables slip through
-    if SOURCE_VAR not in vars_ or TARGET_VAR not in vars_:
-        logger.info('SOURCE or TARGET are not in gp: %s' % gp_)
-        return []
-    if not gp_.matching_node_pairs:
-        logger.info(
-            "no matching node pairs, can't get better by adding constraints"
-        )
-        return []
-    if n < 1:
-        logger.info('cannot add less than one hop')
-        return []
-    # set up lists for nodes, hops, values, gp helpers and query results
-    node = [SOURCE_VAR]
-    for i in range(n):
-        node.append(gen_random_var())
-    node.append(TARGET_VAR)
-    hop = [gen_random_var() for _ in range(n+1)]
-    if direct is None or len(direct) != n+1:
-        logger.info('no direction chosen, or direction tuple of wrong length')
-        direct = [0] * (n+1)
-    gp_helper = []
-    for i in range(n+1):
-        if direct[i] == 0:
-            direct[i] = random.choice([-1, 1])
-        if direct[i] == 1:
-            gp_helper.append(GraphPattern([(node[i], hop[i], node[i+1])]))
-        else:
-            gp_helper.append(GraphPattern([(node[i+1], hop[i], node[i])]))
-    values = [{} for _ in range(n+2)]
-    values[0] = {
-        (SOURCE_VAR, ): [(tup[0], ) for tup in gp_.matching_node_pairs]
-    }
-    values[n+1] = {
-        (TARGET_VAR, ): [(tup[1], ) for tup in gp_.matching_node_pairs]
-    }
-    res_q = [{} for _ in range(n+1)]
-
-    # select a random "hit node" => the node at which the hits are checked
-    hit_node = random.choice(node)
-    hn_ind = node.index(hit_node)
-
-    # TODO: use direct for the cases in querying
-    # querying
-    # from source to target if hit_node is the target:
-    if hit_node == TARGET_VAR:
-        # fire the queries for the first n steps
-        for i in range(0, n):
-            if gp_helper[i][0][0] == node[i]:
-                limit = limit_next
-            else:
-                limit = limit_subject_next
-            q = gp_helper[i].to_sparql_filter_by_count_in_out_query(
-                values=values[i], count_node=node[i+1], in_out=in_out,
-                max_out=max_out, max_in=max_in, limit=limit)
-            res_q[i] = run_query_checked(q, log=logger.info)
-            if res_q[i] is None:
-                return []
-            values[i+1] = get_values([node[i+1]], res_q[i])
-        # fire the last query for the target hits (fixed: the original
-        # compared against node[n-1], which can never be the subject of
-        # gp_helper[n])
-        if gp_helper[n][0][0] == node[n]:
-            limit = limit_choose_endpoint
-        else:
-            limit = limit_choose_subject_endpoint
-        q = gp_helper[n].to_sparql_select_sample_query(
-            values=values[n], limit=limit)
-        res_q[n] = run_query_checked(q, log=logger.info)
-        if res_q[n] is None:
-            return []
-
-    # from target to source if hit_node is the source:
-    elif hit_node == SOURCE_VAR:
-        # fire the queries for the first n steps
-        for i in range(n, 0, -1):
-            if gp_helper[i][0][0] == node[i+1]:
-                limit = limit_next
-            else:
-                limit = limit_subject_next
-            q = gp_helper[i].to_sparql_filter_by_count_in_out_query(
-                values=values[i+1], count_node=node[i], in_out=in_out,
-                max_out=max_out, max_in=max_in, limit=limit)
-            res_q[i] = run_query_checked(q, log=logger.info)
-            if res_q[i] is None:
-                return []
-            values[i] = get_values([node[i]], res_q[i])
-        # fire the last query for the source hits
-        if gp_helper[0][0][0] == node[1]:
-            limit = limit_choose_endpoint
-        else:
-            limit = limit_choose_subject_endpoint
-        q = gp_helper[0].to_sparql_select_sample_query(
-            values=values[1], limit=limit)
-        res_q[0] = run_query_checked(q, log=logger.info)
-        if res_q[0] is None:
-            return []
-
-    # from both sides towards the hit node:
-    else:
-        # fire the queries from the source up to the last node before
-        # hit_node
-        for i in range(0, hn_ind-1):
-            if gp_helper[i][0][0] == node[i]:
-                limit = limit_next
-            else:
-                limit = limit_subject_next
-            q = gp_helper[i].to_sparql_filter_by_count_in_out_query(
-                values=values[i], count_node=node[i+1], in_out=in_out,
-                max_out=max_out, max_in=max_in, limit=limit)
-            res_q[i] = run_query_checked(q, log=logger.info)
-            if res_q[i] is None:
-                return []
-            values[i+1] = get_values([node[i+1]], res_q[i])
-        # fire the queries from the target down to the last node behind
-        # hit_node
-        for i in range(n, hn_ind, -1):
-            if gp_helper[i][0][0] == node[i+1]:
-                limit = limit_next
-            else:
-                limit = limit_subject_next
-            q = gp_helper[i].to_sparql_filter_by_count_in_out_query(
-                values=values[i+1], count_node=node[i], in_out=in_out,
-                max_out=max_out, max_in=max_in, limit=limit)
-            res_q[i] = run_query_checked(q, log=logger.info)
-            if res_q[i] is None:
-                return []
-            values[i] = get_values([node[i]], res_q[i])
-        # fire the last two queries towards hit_node; the limits depend on
-        # whether hit_node is subject or object of the two adjacent triples
-        left_obj = gp_helper[hn_ind-1][0][0] == node[hn_ind-1]
-        right_obj = gp_helper[hn_ind][0][0] == node[hn_ind+1]
-        if left_obj and right_obj:
-            limit_l = limit_r = limit_endpoint_two_sided
-        elif not left_obj and not right_obj:
-            limit_l = limit_r = limit_startpoint_two_sided
-        elif left_obj:
-            limit_l, limit_r = limit_obj_to_subj, limit_subj_to_obj
-        else:
-            limit_l, limit_r = limit_subj_to_obj, limit_obj_to_subj
-        q = gp_helper[hn_ind-1].to_sparql_filter_by_count_in_out_query(
-            values=values[hn_ind-1], count_node=node[hn_ind],
-            in_out=in_out, max_out=max_out, max_in=max_in, limit=limit_l)
-        res_q[hn_ind-1] = run_query_checked(q, log=logger.info)
-        if res_q[hn_ind-1] is None:
-            return []
-        q = gp_helper[hn_ind].to_sparql_filter_by_count_in_out_query(
-            values=values[hn_ind+1], count_node=node[hn_ind],
-            in_out=in_out, max_out=max_out, max_in=max_in, limit=limit_r)
-        res_q[hn_ind] = run_query_checked(q, log=logger.info)
-        if res_q[hn_ind] is None:
-            return []
-
-    gp_list = get_fixed_path_gp_n_hops(
-        res_q, gp_, n, direct, gp_.matching_node_pairs, node, hn_ind, hop
-    )
-
-    return gp_list
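-
-
-# Editorial usage sketch (not from the original commit), matching the
-# commented-out experiment at the bottom of this file:
-#
-#     for i in range(20):
-#         res.append(mutate_deep_narrow_n_hops(
-#             gp, 2, max_out=65, in_out='out'))
-#
-# i.e. try to add a two-hop bridge between source and target, keeping only
-# intermediate nodes whose fan-out stays below max_out (as the
-# max_out/in_out parameter names suggest).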
-
-
-def _init_hop_search(gtps, n, direct):
-    # shared setup of the node/hop variable lists, edge directions,
-    # one-triple helper patterns and VALUES blocks (extracted from the
-    # identical preamble of the seven variants below)
-    node = [SOURCE_VAR]
-    for i in range(n):
-        node.append(Variable('n%i' % i))
-    node.append(TARGET_VAR)
-    hop = [Variable('p%i' % i) for i in range(n + 1)]
-    if direct is None or len(direct) != n + 1:
-        logger.debug('no direction chosen, or direction tuple of wrong length')
-        direct = [0] * (n + 1)
-    gp_helper = []
-    for i in range(n + 1):
-        if direct[i] == 0:
-            direct[i] = random.choice([-1, 1])
-        if direct[i] == 1:
-            gp_helper.append(GraphPattern([(node[i], hop[i], node[i + 1])]))
-        else:
-            gp_helper.append(GraphPattern([(node[i + 1], hop[i], node[i])]))
-    values = {
-        SOURCE_VAR: {(SOURCE_VAR,): [(tup[0],) for tup in gtps]},
-        TARGET_VAR: {(TARGET_VAR,): [(tup[1],) for tup in gtps]},
-        'st': {(SOURCE_VAR, TARGET_VAR): gtps},
-    }
-    res_q = [{} for _ in range(n + 1)]
-    return node, hop, direct, gp_helper, values, res_q
-
-
-def _sample_hop_block(hop_var, hop_values):
-    # VALUES block with a uniform sample of at most 10 of the found hops
-    rows = hop_values[(hop_var,)]
-    return {(hop_var,): random.sample(rows, min(10, len(rows)))}
-
-
-def _paths_from_inst_result(res_q_inst, node, hop, direct, n):
-    # turn the rows of the instantiation query into graph patterns along
-    # the chosen edge directions
-    bind = sparql_json_result_bindings_to_rdflib(
-        get_path(res_q_inst, ['results', 'bindings'], default=[])
-    )
-    res = []
-    for row in bind:
-        res.append(GraphPattern([
-            (node[i], get_path(row, [hop[i]]), node[i + 1])
-            if direct[i] == 1
-            else (node[i + 1], get_path(row, [hop[i]]), node[i])
-            for i in range(n + 1)
-        ]))
-    return res
-
-
-# first version, completely straightforward
-def mutate_deep_narrow_1(gp_, gtps, n, direct=None, gp_in=False):
-    node, hop, direct, gp_helper, values, res_q = _init_hop_search(
-        gtps, n, direct)
-    # queries for the single steps
-    valueblocks = {SOURCE_VAR: values[SOURCE_VAR]}
-    for i in range(n + 1):
-        q = gp_.to_sparql_deep_narrow_path_query(
-            hop[i], node[i + 1], valueblocks, gp_helper[:i + 1], gp_in=gp_in
-        )
-        res_q[i] = run_query_checked(q)
-        if res_q[i] is None:
-            return []
-        values[hop[i]] = get_values([hop[i]], res_q[i])
-        valueblocks[hop[i]] = _sample_hop_block(hop[i], values[hop[i]])
-
-    # query for the result
-    gp_help = GraphPattern([
-        (node[i], hop[i], node[i + 1]) if direct[i] == 1
-        else (node[i + 1], hop[i], node[i])
-        for i in range(n + 1)
-    ])
-    # shared source/target block so that only "real" paths are found
-    del valueblocks[SOURCE_VAR]
-    valueblocks['st'] = values['st']
-    q = gp_.to_sparql_deep_narrow_path_inst_query_old(
-        hop, valueblocks, gp_help, gp_in=gp_in
-    )
-    res_q_inst = run_query_checked(q)
-    if res_q_inst is None:
-        return []
-    return _paths_from_inst_result(res_q_inst, node, hop, direct, n)
-
-
-# second version: the query for the last step already gets the targets
-def mutate_deep_narrow_2(gp_, gtps, n, direct=None, gp_in=False):
-    node, hop, direct, gp_helper, values, res_q = _init_hop_search(
-        gtps, n, direct)
-    # queries for the single steps (the last hop is left to the
-    # instantiation query)
-    valueblocks = {SOURCE_VAR: values[SOURCE_VAR]}
-    for i in range(n):
-        q = gp_.to_sparql_deep_narrow_path_query(
-            hop[i], node[i + 1], valueblocks, gp_helper[:i + 1], gp_in=gp_in
-        )
-        res_q[i] = run_query_checked(q)
-        if res_q[i] is None:
-            return []
-        values[hop[i]] = get_values([hop[i]], res_q[i])
-        valueblocks[hop[i]] = _sample_hop_block(hop[i], values[hop[i]])
-
-    # shared source/target block so that only "real" paths are found
-    del valueblocks[SOURCE_VAR]
-    valueblocks['st'] = values['st']
-    q = gp_.to_sparql_deep_narrow_path_inst_query(
-        hop, valueblocks, gp_helper, gp_in=gp_in
-    )
-    res_q_inst = run_query_checked(q)
-    if res_q_inst is None:
-        return []
-    return _paths_from_inst_result(res_q_inst, node, hop, direct, n)
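-
-
-# Editorial note: versions 1 and 2 differ only in the last step. Version 2
-# stops pre-fetching hop candidates one step early and lets the
-# instantiation query (the non-_old variant, which receives gp_helper)
-# bind the final hop directly against the source/target VALUES block.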
-
-
-def _merge_valueblocks(vb_s, vb_t, st_values):
-    # merge the hop blocks of both search directions and use one shared
-    # source/target block so that only "real" paths are found
-    vb = dict(
-        (k, v) for k, v in chain(vb_s.items(), vb_t.items())
-        if k not in (SOURCE_VAR, TARGET_VAR)
-    )
-    vb['st'] = st_values
-    return vb
-
-
-# third version: bidirectional, straightforward
-def mutate_deep_narrow_3(gp_, gtps, n, direct=None, gp_in=False):
-    node, hop, direct, gp_helper, values, res_q = _init_hop_search(
-        gtps, n, direct)
-    # queries for the single steps, alternating from both ends
-    valueblocks_s = {SOURCE_VAR: values[SOURCE_VAR]}
-    valueblocks_t = {TARGET_VAR: values[TARGET_VAR]}
-    for i in range(int((n / 2) + 1)):
-        q = gp_.to_sparql_deep_narrow_path_query(
-            hop[i], node[i + 1], valueblocks_s, gp_helper[:i + 1],
-            gp_in=gp_in
-        )
-        res_q[i] = run_query_checked(q)
-        if res_q[i] is None:
-            return []
-        values[hop[i]] = get_values([hop[i]], res_q[i])
-        valueblocks_s[hop[i]] = _sample_hop_block(hop[i], values[hop[i]])
-        if n - i != i:
-            q = gp_.to_sparql_deep_narrow_path_query(
-                hop[n - i], node[n - i], valueblocks_t, gp_helper[n - i:],
-                startvar=TARGET_VAR, gp_in=gp_in
-            )
-            res_q[n - i] = run_query_checked(q)
-            if res_q[n - i] is None:
-                return []
-            values[hop[n - i]] = get_values([hop[n - i]], res_q[n - i])
-            valueblocks_t[hop[n - i]] = _sample_hop_block(
-                hop[n - i], values[hop[n - i]])
-
-    # query for the result
-    gp_help = GraphPattern([
-        (node[i], hop[i], node[i + 1]) if direct[i] == 1
-        else (node[i + 1], hop[i], node[i])
-        for i in range(n + 1)
-    ])
-    valueblocks = _merge_valueblocks(
-        valueblocks_s, valueblocks_t, values['st'])
-    q = gp_.to_sparql_deep_narrow_path_inst_query_old(
-        hop, valueblocks, gp_help, gp_in=gp_in
-    )
-    res_q_inst = run_query_checked(q)
-    if res_q_inst is None:
-        return []
-    return _paths_from_inst_result(res_q_inst, node, hop, direct, n)
-
-
-# fourth version: bidirectional with instantiation in the last step
-def mutate_deep_narrow_4(gp_, gtps, n, direct=None, gp_in=False):
-    node, hop, direct, gp_helper, values, res_q = _init_hop_search(
-        gtps, n, direct)
-    valueblocks_s = {SOURCE_VAR: values[SOURCE_VAR]}
-    valueblocks_t = {TARGET_VAR: values[TARGET_VAR]}
-    for i in range(int((n / 2) + 1)):
-        if i < int(n / 2):
-            q = gp_.to_sparql_deep_narrow_path_query(
-                hop[i], node[i + 1], valueblocks_s, gp_helper[:i + 1],
-                SOURCE_VAR, gp_in=gp_in
-            )
-            res_q[i] = run_query_checked(q)
-            if res_q[i] is None:
-                return []
-            values[hop[i]] = get_values([hop[i]], res_q[i])
-            valueblocks_s[hop[i]] = _sample_hop_block(
-                hop[i], values[hop[i]])
-        if n - i > i:
-            q = gp_.to_sparql_deep_narrow_path_query(
-                hop[n - i], node[n - i], valueblocks_t, gp_helper[n - i:],
-                TARGET_VAR, gp_in=gp_in
-            )
-            res_q[n - i] = run_query_checked(q)
-            if res_q[n - i] is None:
-                return []
-            values[hop[n - i]] = get_values([hop[n - i]], res_q[n - i])
-            valueblocks_t[hop[n - i]] = _sample_hop_block(
-                hop[n - i], values[hop[n - i]])
-
-    # the middle hop is left to the instantiation query
-    valueblocks = _merge_valueblocks(
-        valueblocks_s, valueblocks_t, values['st'])
-    q = gp_.to_sparql_deep_narrow_path_inst_query(
-        hop, valueblocks, gp_helper, gp_in=gp_in
-    )
-    res_q_inst = run_query_checked(q)
-    if res_q_inst is None:
-        return []
-    return _paths_from_inst_result(res_q_inst, node, hop, direct, n)
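-
-
-# Editorial worked example for the bidirectional split: for n = 4 (hops
-# 0..4) the loop runs i = 0, 1, 2. Version 4 pre-fetches hops 0 and 1
-# from the source side (i < int(n / 2)) and hops 4 and 3 from the target
-# side (n - i > i), leaving the middle hop 2 entirely to the
-# instantiation query.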
hat kein Ergebnis geliefert') - return [] - elif not res_q[i]['results']['bindings']: - logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') - return [] - values[hop[i]] = get_values([hop[i]], res_q[i]) - valueblocks[hop[i]] = get_weighted_sample( - hop[i], Variable('avgc'+''.join(node[i+1])), res_q[i] - ) - - # Query fürs Ergebnis - gp_help = GraphPattern([ - (node[i], hop[i], node[i+1]) if direct[i] == 1 - else (node[i+1], hop[i], node[i]) - for i in range(n+1) - ]) - # gemeinsamer source/target-block, damit nur "richtige" Pfade gefunden - # werden - del valueblocks[SOURCE_VAR] - valueblocks['st'] = values['st'] - q = gp_.to_sparql_deep_narrow_path_inst_query_old(hop, valueblocks, gp_help, gp_in=gp_in) - logger.debug(q) - try: - t, res_q_inst = run_query(q) - except: - logger.debug('Die Query (s.o.) hat nicht geklappt') - return [] - if not res_q_inst: - logger.debug('Die Query (s.o.) hat kein Ergebnis geliefert') - return [] - elif not res_q_inst['results']['bindings']: - logger.debug('Die Query (s.o.) hat keine gebundenen Variablen') - return [] - res = [] - res_rows_path = ['results', 'bindings'] - bind = sparql_json_result_bindings_to_rdflib( - get_path(res_q_inst, res_rows_path, default=[]) - ) - for row in bind: - gp_res = GraphPattern([ - (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1 - else (node[i + 1], get_path(row, [hop[i]]), node[i]) - for i in range(n + 1) - ]) - res.append(gp_res) - - return res - - -# sechste Version: Query für letzten step bekommt schon die Targets -# => Precheck feasible? -def mutate_deep_narrow_6( - gp_, gtps, n, direct=None, gp_in=False -): - node = [SOURCE_VAR] - for i in range(n): - node.append(Variable('n%i' % i)) - node.append(TARGET_VAR) - hop = [] - for i in range(n + 1): - hop.append(Variable('p%i' % i)) - if direct is None or len(direct) != n + 1: - logger.debug( - 'No direction chosen, or direction tuple with false length' - ) - direct = [] - for i in range(n + 1): - direct.append(0) - gp_helper = [] - for i in range(n + 1): - if direct[i] == 0: - direct[i] = random.choice([-1, 1]) - if direct[i] == 1: - gp_helper.append( - GraphPattern([(node[i], hop[i], node[i + 1])]) - ) - else: - gp_helper.append( - GraphPattern([(node[i + 1], hop[i], node[i])]) - ) - values = {} - values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]} - values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]} - values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps} - res_q = [] - for i in range(n + 1): - res_q.append({}) - - # Pre-check: - gp_help = GraphPattern([ - (node[i], hop[i], node[i+1]) if direct[i] == 1 - else (node[i+1], hop[i], node[i]) - for i in range(n+1) - ]) - q = gp_help.to_sparql_precheck_query(values['st'], gp_in=gp_in) - logger.debug(q) - try: - t, res_q = run_query(q) - except: - logger.info('Pre-Check hat nicht geklappt') - if not res_q: - logger.info('Pre-Check hat kein Ergebnis') - elif not res_q['results']['bindings']: - logger.info('Pre-Check hat keine gebundenen Variablen') - else: - logger.info('Pre-Check hat einen Treffer') - - # Queries für die Schritte - valueblocks = {} - valueblocks[SOURCE_VAR] = values[SOURCE_VAR] - for i in range(n): - q = gp_.to_sparql_deep_narrow_path_query( - hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in - ) - logger.debug(q) - try: - t, res_q[i] = run_query(q) - except: - logger.debug('Die Query (s.o.) hat nicht geklappt') - return [] - if not res_q[i]: - logger.debug('Die Query (s.o.) 
-# sixth version: the query for the last step already gets the targets
-# => precheck feasible?
-def mutate_deep_narrow_6(
-        gp_, gtps, n, direct=None, gp_in=False
-):
-    node = [SOURCE_VAR]
-    for i in range(n):
-        node.append(Variable('n%i' % i))
-    node.append(TARGET_VAR)
-    hop = []
-    for i in range(n + 1):
-        hop.append(Variable('p%i' % i))
-    if direct is None or len(direct) != n + 1:
-        logger.debug(
-            'No direction chosen, or direction tuple with false length'
-        )
-        direct = []
-        for i in range(n + 1):
-            direct.append(0)
-    gp_helper = []
-    for i in range(n + 1):
-        if direct[i] == 0:
-            direct[i] = random.choice([-1, 1])
-        if direct[i] == 1:
-            gp_helper.append(
-                GraphPattern([(node[i], hop[i], node[i + 1])])
-            )
-        else:
-            gp_helper.append(
-                GraphPattern([(node[i + 1], hop[i], node[i])])
-            )
-    values = {}
-    values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]}
-    values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]}
-    values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps}
-    res_q = []
-    for i in range(n + 1):
-        res_q.append({})
-
-    # pre-check:
-    gp_help = GraphPattern([
-        (node[i], hop[i], node[i+1]) if direct[i] == 1
-        else (node[i+1], hop[i], node[i])
-        for i in range(n+1)
-    ])
-    q = gp_help.to_sparql_precheck_query(values['st'], gp_in=gp_in)
-    logger.debug(q)
-    try:
-        t, res_q = run_query(q)
-    except:
-        logger.info('Pre-check failed')
-    if not res_q:
-        logger.info('Pre-check returned no result')
-    elif not res_q['results']['bindings']:
-        logger.info('Pre-check has no bound variables')
-    else:
-        logger.info('Pre-check got a hit')
-
-    # queries for the single steps
-    valueblocks = {}
-    valueblocks[SOURCE_VAR] = values[SOURCE_VAR]
-    for i in range(n):
-        q = gp_.to_sparql_deep_narrow_path_query(
-            hop[i], node[i+1], valueblocks, gp_helper[:i+1], gp_in=gp_in
-        )
-        logger.debug(q)
-        try:
-            t, res_q[i] = run_query(q)
-        except:
-            logger.debug('The query (see above) failed')
-            return []
-        if not res_q[i]:
-            logger.debug('The query (see above) returned no result')
-            return []
-        elif not res_q[i]['results']['bindings']:
-            logger.debug('The query (see above) has no bound variables')
-            return []
-        values[hop[i]] = get_values([hop[i]], res_q[i])
-        valueblocks[hop[i]] = {
-            (hop[i],): random.sample(
-                values[hop[i]][(hop[i],)],
-                min(10, len(values[hop[i]][(hop[i],)]))
-            )
-        }
-
-    # shared source/target block, so that only "real" paths are found
-    del valueblocks[SOURCE_VAR]
-    valueblocks['st'] = values['st']
-    q = gp_.to_sparql_deep_narrow_path_inst_query(
-        hop, valueblocks, gp_helper, gp_in=gp_in
-    )
-    logger.debug(q)
-    try:
-        t, res_q_inst = run_query(q)
-    except:
-        logger.debug('The query (see above) failed')
-        return []
-    if not res_q_inst:
-        logger.debug('The query (see above) returned no result')
-        return []
-    elif not res_q_inst['results']['bindings']:
-        logger.debug('The query (see above) has no bound variables')
-        return []
-    res = []
-    res_rows_path = ['results', 'bindings']
-    bind = sparql_json_result_bindings_to_rdflib(
-        get_path(res_q_inst, res_rows_path, default=[])
-    )
-    for row in bind:
-        gp_res = GraphPattern([
-            (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1
-            else (node[i + 1], get_path(row, [hop[i]]), node[i])
-            for i in range(n + 1)
-        ])
-        res.append(gp_res)
-
-    return res
-
-
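# --- editor's illustration (not part of the patch) ----------------------
# to_sparql_precheck_query() is not shown in this diff. Conceptually the
# pre-check binds all (source, target) pairs at once and asks whether any
# complete path of the chosen shape exists at all, before paying for the
# step-wise queries. A hedged sketch for a fixed two-hop shape (the
# function name precheck_query is made up):
def precheck_query(gtps):
    """gtps: iterable of (source, target) rdflib URIRef pairs."""
    values = ''.join(
        '(%s %s)\n' % (s.n3(), t.n3()) for s, t in gtps
    )
    return (
        'SELECT * {\n'
        'VALUES (?source ?target) {\n%s}\n'
        '?source ?p0 ?n0 .\n'
        '?n0 ?p1 ?target .\n'
        '}\nLIMIT 1'
    ) % values
# -------------------------------------------------------------------------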
-# seventh version: BIDI with instantiation in the last step + ws-sampling
-def mutate_deep_narrow_7(
-        gp_, gtps, n, direct=None, gp_in=False
-):
-    node = [SOURCE_VAR]
-    for i in range(n):
-        node.append(Variable('n%i' % i))
-    node.append(TARGET_VAR)
-    hop = []
-    for i in range(n + 1):
-        hop.append(Variable('p%i' % i))
-    if direct is None or len(direct) != n + 1:
-        logger.debug(
-            'No direction chosen, or direction tuple with false length'
-        )
-        direct = []
-        for i in range(n + 1):
-            direct.append(0)
-    gp_helper = []
-    for i in range(n + 1):
-        if direct[i] == 0:
-            direct[i] = random.choice([-1, 1])
-        if direct[i] == 1:
-            gp_helper.append(
-                GraphPattern([(node[i], hop[i], node[i + 1])])
-            )
-        else:
-            gp_helper.append(
-                GraphPattern([(node[i + 1], hop[i], node[i])])
-            )
-    values = {}
-    values[SOURCE_VAR] = {(SOURCE_VAR,): [(tup[0],) for tup in gtps]}
-    values[TARGET_VAR] = {(TARGET_VAR,): [(tup[1],) for tup in gtps]}
-    values['st'] = {(SOURCE_VAR, TARGET_VAR): gtps}
-    res_q = []
-    for i in range(n+1):
-        res_q.append({})
-
-    # queries for the single steps
-    valueblocks_s = {}
-    valueblocks_s[SOURCE_VAR] = values[SOURCE_VAR]
-    valueblocks_t = {}
-    valueblocks_t[TARGET_VAR] = values[TARGET_VAR]
-    for i in range(int((n / 2) + 1)):
-        if i < int(n/2):
-            q = gp_.to_sparql_deep_narrow_path_query(
-                hop[i], node[i+1], valueblocks_s, gp_helper[:i+1], gp_in=gp_in
-            )
-            logger.debug(q)
-            try:
-                t, res_q[i] = run_query(q)
-            except:
-                logger.debug('The query (see above) failed')
-                return []
-            if not res_q[i]:
-                logger.debug('The query (see above) returned no result')
-                return []
-            elif not res_q[i]['results']['bindings']:
-                logger.debug('The query (see above) has no bound variables')
-                return []
-            values[hop[i]] = get_values([hop[i]], res_q[i])
-            valueblocks_s[hop[i]] = get_weighted_sample(
-                hop[i], Variable('avgc' + ''.join(node[i + 1])), res_q[i]
-            )
-        if n-i > i:
-            q = gp_.to_sparql_deep_narrow_path_query(
-                hop[n-i],
-                node[n-i],
-                valueblocks_t,
-                gp_helper[n-i:],
-                startvar=TARGET_VAR,
-                gp_in=gp_in
-            )
-            logger.debug(q)
-            try:
-                t, res_q[n-i] = run_query(q)
-            except:
-                logger.debug('The query (see above) failed')
-                return []
-            if not res_q[n-i]:
-                logger.debug('The query (see above) returned no result')
-                return []
-            elif not res_q[n-i]['results']['bindings']:
-                logger.debug('The query (see above) has no bound variables')
-                return []
-            values[hop[n-i]] = get_values([hop[n-i]], res_q[n-i])
-            valueblocks_t[hop[n-i]] = get_weighted_sample(
-                hop[n-i], Variable('avgc' + ''.join(node[n-i])), res_q[n-i]
-            )
-
-    # query for the final result
-    # shared source/target block, so that only "real" paths are found
-    valueblocks = {}
-    for key in valueblocks_s:
-        if key is not SOURCE_VAR:
-            valueblocks[key] = valueblocks_s[key]
-    for key in valueblocks_t:
-        if key is not TARGET_VAR:
-            valueblocks[key] = valueblocks_t[key]
-    valueblocks['st'] = values['st']
-    q = gp_.to_sparql_deep_narrow_path_inst_query(
-        hop, valueblocks, gp_helper, gp_in=gp_in
-    )
-    logger.debug(q)
-    try:
-        t, res_q_inst = run_query(q)
-    except:
-        logger.debug('The query (see above) failed')
-        return []
-    if not res_q_inst:
-        logger.debug('The query (see above) returned no result')
-        return []
-    elif not res_q_inst['results']['bindings']:
-        logger.debug('The query (see above) has no bound variables')
-        return []
-    res = []
-    res_rows_path = ['results', 'bindings']
-    bind = sparql_json_result_bindings_to_rdflib(
-        get_path(res_q_inst, res_rows_path, default=[])
-    )
-    for row in bind:
-        gp_res = GraphPattern([
-            (node[i], get_path(row, [hop[i]]), node[i + 1]) if direct[i] == 1
-            else (node[i + 1], get_path(row, [hop[i]]), node[i])
-            for i in range(n + 1)
-        ])
-        res.append(gp_res)
-
-    return res
-

 def main():
     ground_truth_pairs = get_semantic_associations()
@@ -3111,328 +198,6 @@ def main():
             logger.info(i)
             logger.info(r)
             res.append(r)
-    # for key in gp_found.keys():
-    #     gp_ = gp_found[key]
-    #     eval_gp(gtp_scores, gp_)
-    #     for i in range(100):
-    #         res_ = mutate_deep_narrow_4(
-    #             gp_, gp_.matching_node_pairs, 6, gp_in=False
-    #         )
-    #         res.append(res_)
-    #         logger.info((i, key))
-    #         if res_:
-    #             logger.info(res_)
-
-    # res_eval = []
-    # res = []
-    #
-    # max_out = 65
-    # max_in = 40
-    # in_out = 'out'
-    # richtung = 2
-    # ground_truth_pairs = get_semantic_associations()
-    # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
-    # # ground_truth_pairs = ground_truth_pairs[0:200]
-    # gtp_scores = GTPScores(ground_truth_pairs)
-    # gp = gp_found['140']
-    # eval_gp(gtp_scores, gp)
-    #
-    # for i in range(20):
-    #     res.append(mutate_deep_narrow_n_hops(gp, 2, max_out=max_out, in_out=in_out))
-    #
-    # logger.info(res)
-    #
-    # durchgaenge = []
-    #
-    # for richtung in range(1, 9):
-    #     for max_out in [5, 10, 20, 30, 40, 50, 65, 75, 85, 100, 200]:
-    #         for key in gp_found.keys():
-    #             durchgaenge.append((richtung, max_out, key))
-    #
-    # random.shuffle(durchgaenge)
-    #
-    # for (richtung, max_out, key) in durchgaenge:
-    #     logger.info('Run: richtung = %s, max_out = %s, gp.key = %s' %
-    #                 (richtung, max_out, key)
-    #                 )
-    #     ground_truth_pairs = get_semantic_associations()
-    #     ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
-    #     # ground_truth_pairs = random.sample(ground_truth_pairs, 100)
-    #     gtp_scores = GTPScores(ground_truth_pairs)
-    #     gp = gp_found[key]
-    #     eval_gp(gtp_scores, gp)
-    #
-    #     res_gp = mutate_deep_narrow_two_hops(
-    #         gp,
-    #         max_out=max_out,
-    #         max_in=max_in,
-    #         in_out=in_out,
-    #         richtung=richtung
-    #     )
-    #     res_gp.append(gp)
-    #     res_eval = eval_gp_list(gtp_scores, res_gp)
-    #     gp_eval = res_eval[-1]
-    #     res_eval = sorted(
-    #         res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score
-    #     )
-    #     if res_eval:
-    #         logger.info(max_out)
-    #         print_graph_pattern(gp)
-    #         for gp_ in res_eval:
-    #             print_graph_pattern(gp_)
-    #     res.append((richtung, key, max_out, gp_eval, res_eval))
-
-    # f = open('store.pckl', 'wb')
-    # pickle.dump(res, f)
-    # f.close()
-
-    # to inspect res again in the console:
-    # import pickle
-    # f = open('tests/store.pckl', 'rb')
-    # res = pickle.load(f)
-    # f.close()
-
-    # print('HERE STARTS THE RES_PRINTING:')
-    # for r in res:
-    #     print('richtung %s, key %s, max_out %s\n' % r[0:3])
-    #     print('Original GP:\n')
-    #     print_graph_pattern(r[3], print_matching_node_pairs=0)
-    #     print('Top 3 found (if 3 were found, else all found) GP:\n')
-    #     for i in range(min(3, len(r[4]))):
-    #         print_graph_pattern(r[4][i], print_matching_node_pairs=0)
-
-    # ground_truth_pairs = get_semantic_associations()
-    # ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
-    # ground_truth_pairs = random.sample(ground_truth_pairs, 100)
-    # gtp_scores = GTPScores(ground_truth_pairs)
-    # gp = gp_found[random.choice(gp_found.keys())]
-    #
-    # max_out = 50
-    # max_in = 40
-    # in_out = 'out'
-    #
-    # res = mutate_deep_narrow_one_hop_s_t_without_direction(
-    #     gp,
-    #     ground_truth_pairs,
-    #     max_out=max_out,
-    #     max_in=max_in,
-    #     in_out=in_out
-    # )
-    # res.append(gp)
-    # res_eval = eval_gp_list(gtp_scores, res)
-    # gp_eval = res_eval[-1]
-    # res_eval = sorted(res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score)
-    #
-    # print_graph_pattern(gp_eval)
-    # for gp_ in res_eval:
-    #     print_graph_pattern(gp_)
-
-    # # counter fields for the statistics (accessed via max_in_out)
-    # # average number of returned patterns
-    # avg_num_pat = {}
-    # # maximum number of returned patterns
-    # max_num_pat = {}
-    # # average score of all returned patterns
-    # avg_score_all_pat = {}
-    # # average score of the best returned pattern (if any)
-    # avg_score_best_pat = {}
-    # # average score of the best returned pattern (0 if there is none)
-    # avg_score_best_pat_pun = {}
-    # # maximum score of any returned pattern
-    # max_score_ovrall = {}
-    # # how often no pattern was returned
-    # num_no_pattern = {}
-    # # average deviation of the best pattern from the score of the
-    # # original pattern, if any
-    # avg_diff_all_pat = {}
-    # # average deviation from the score of the original pattern, if any
-    # avg_diff_best_pat = {}
-    # # summed-up score of runs without any pattern
-    # punish_avg_diff_best_pat = {}
-    # # summed-up score of runs without any pattern times the average
-    # # number of returned patterns
-    # punish_avg_diff_all_pat = {}
-    # # average deviation of the best pattern from the score of the
-    # # original pattern, with a penalty for no pattern at all
-    # avg_diff_all_pat_punished = {}
-    # # average deviation from the score of the original pattern, with a
-    # # penalty for no pattern at all
-    # avg_diff_best_pat_punished = {}
-    # # the five best (most improving) patterns
-    # five_best_pattern = {}
-    #
-    # max_out_steps = [10, 15, 20, 25, 30, 40, 50, 75, 100]
-    #
-    # for j in max_out_steps:
-    #     avg_num_pat[j] = 0
-    #     max_num_pat[j] = 0
-    #     avg_score_all_pat[j] = 0
-    #     avg_score_best_pat[j] = 0
-    #     avg_score_best_pat_pun[j] = 0
-    #     max_score_ovrall[j] = 0
-    #     num_no_pattern[j] = 0
-    #     avg_diff_all_pat[j] = 0
-    #     avg_diff_best_pat[j] = 0
-    #     punish_avg_diff_best_pat[j] = 0
-    #     punish_avg_diff_all_pat[j] = 0
-    #     avg_diff_all_pat_punished[j] = 0
-    #     avg_diff_best_pat_punished[j] = 0
-    #     five_best_pattern[j] = []
-    #
-    # reps = 50
-    #
-    # for i in range(reps):
-    #     ground_truth_pairs = get_semantic_associations()
-    #     ground_truth_pairs, _ = split_training_test_set(ground_truth_pairs)
-    #     ground_truth_pairs = random.sample(ground_truth_pairs, 100)
-    #     gtp_scores = GTPScores(ground_truth_pairs)
-    #     gp = gp_found[random.choice(gp_found.keys())]
-    #     for j in max_out_steps:
-    #         res = mutate_deep_narrow_one_hop_s_t_without_direction(
-    #             gp, ground_truth_pairs, max_out=j, in_out='out'
-    #         )  # TODO: why does None come in above???
-    #         res.append(gp)
-    #         res_eval = eval_gp_list(gtp_scores, res)
-    #         gp_eval = res_eval[-1]
-    #         res_eval = sorted(
-    #             res_eval[:-1], key=lambda gp_: -gp_.fitness.values.score
-    #         )
-    #
-    #         # statistics:
-    #         avg_num_pat[j] = avg_num_pat[j] + len(res_eval) / reps
-    #         if len(res_eval) > max_num_pat[j]:
-    #             max_num_pat[j] = len(res_eval)
-    #         for gp_ in res_eval:
-    #             avg_score_all_pat[j] = avg_score_all_pat[j] + \
-    #                 gp_.fitness.values.score / \
-    #                 (len(res_eval) * reps)
-    #         if res_eval:
-    #             avg_score_best_pat[j] = avg_score_best_pat[j] + \
-    #                 res_eval[0].fitness.values.score
-    #         if res_eval:
-    #             if res_eval[0].fitness.values.score > max_score_ovrall[j]:
-    #                 max_score_ovrall[j] = res_eval[0].fitness.values.score
-    #         if len(res_eval) == 0:
-    #             num_no_pattern[j] = num_no_pattern[j] + 1
-    #         if res_eval:
-    #             avg_diff_all_pat[j] = avg_diff_all_pat[j] + \
-    #                 (res_eval[0].fitness.values.score -
-    #                  gp_eval.fitness.values.score) / \
-    #                 reps
-    #         for gp_ in res_eval:
-    #             avg_diff_best_pat[j] = avg_diff_best_pat[j] + \
-    #                 (gp_.fitness.values.score -
-    #                  gp_eval.fitness.values.score) / \
-    #                 (len(res_eval) * reps)
-    #         if not res_eval:
-    #             punish_avg_diff_best_pat[j] = punish_avg_diff_best_pat[j] + \
-    #                 gp_eval.fitness.values.score
-    #         if res_eval:
-    #             if len(five_best_pattern[j]) < 5:
-    #                 five_best_pattern[j].append((
-    #                     res_eval[0].fitness.values.score -
-    #                     gp_eval.fitness.values.score,
-    #                     res_eval[0],
-    #                     gp_eval
-    #                 ))
-    #                 five_best_pattern[j] = sorted(
-    #                     five_best_pattern[j],
-    #                     key=lambda tup_: -tup_[0]
-    #                 )
-    #             else:
-    #                 five_best_pattern[j][4] = (
-    #                     res_eval[0].fitness.values.score -
-    #                     gp_eval.fitness.values.score,
-    #                     res_eval[0],
-    #                     gp_eval
-    #                 )
-    #                 five_best_pattern[j] = sorted(
-    #                     five_best_pattern[j],
-    #                     key=lambda tup_: -tup_[0]
-    #                 )
-    #     logger.info('Round %s, min_max = %s' % (i, j))
-    #     print_graph_pattern(gp)
-    #     if res_eval:
-    #         print_graph_pattern(res_eval[0])
-    #
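# --- editor's note (not part of the patch): the commented-out bookkeeping
# above maintains a top-5 list via manual append/replace plus re-sorting;
# collecting (score_diff, new_gp, original_gp) tuples and picking the best
# five afterwards does the same in one standard-library call:
import heapq

def top_k(candidates, k=5):
    """candidates: iterable of (score_diff, new_gp, original_gp) tuples."""
    return heapq.nlargest(k, candidates, key=lambda tup_: tup_[0])
# -------------------------------------------------------------------------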
-    # # print out the five best patterns per min_max:
-    # logger.info(' The five best new patterns (per min_max): ')
-    # for j in max_out_steps:
-    #     for i in range(len(five_best_pattern[j])):
-    #         print('min_max: %s\n' % j)
-    #         print('Difference: %s\n' % five_best_pattern[j][i][0])
-    #         print_graph_pattern(five_best_pattern[j][i][1])
-    #         print_graph_pattern(five_best_pattern[j][i][2])
-    #
-    # # more statistics
-    # for j in max_out_steps:
-    #     avg_score_best_pat_pun[j] = avg_score_best_pat[j] / reps
-    #     if reps - num_no_pattern[j]:
-    #         avg_score_best_pat[j] = avg_score_best_pat[j] / \
-    #             (reps - num_no_pattern[j])
-    #     else:
-    #         avg_score_best_pat = -1
-    #     punish_avg_diff_all_pat[j] = punish_avg_diff_best_pat[j] * \
-    #         avg_num_pat[j]
-    #     avg_diff_all_pat_punished[j] = avg_diff_all_pat[j] - \
-    #         punish_avg_diff_best_pat[j]
-    #     avg_diff_best_pat_punished[j] = avg_diff_best_pat[j] - \
-    #         punish_avg_diff_all_pat[j]
-    #
-    # # print the statistics
-    # logger.info('min_max: %s\n'
-    #             'avg_num_pat: %s\n'
-    #             'max_num_pat: %s\n'
-    #             'avg_score_all_pat: %s\n'
-    #             'avg_score_best_pat: %s\n'
-    #             'avg_score_best_pat_pun: %s\n'
-    #             'max_score_ovrall: %s\n'
-    #             'num_no_pattern: %s\n'
-    #             'avg_diff_all_pat: %s\n'
-    #             'avg_diff_best_pat: %s\n'
-    #             'punish_avg_diff_best_pat: %s\n'
-    #             'punish_avg_diff_all_pat: %s\n'
-    #             'avg_diff_all_pat_punished: %s\n'
-    #             'avg_diff_best_pat_punished: %s\n' % (
-    #                 ' '.join([str(x) for x in max_out_steps]),
-    #                 ' '.join([str(avg_num_pat[x]) for x in max_out_steps]),
-    #                 ' '.join([str(max_num_pat[x]) for x in max_out_steps]),
-    #                 ' '.join([str(avg_score_all_pat[x]) for x in max_out_steps]),
-    #                 ' '.join([str(avg_score_best_pat[x]) for x in max_out_steps]),
-    #                 ' '.join(
-    #                     [str(avg_score_best_pat_pun[x]) for x in max_out_steps]
-    #                 ),
-    #                 ' '.join([str(max_score_ovrall[x]) for x in max_out_steps]),
-    #                 ' '.join([str(num_no_pattern[x]) for x in max_out_steps]),
-    #                 ' '.join([str(avg_diff_all_pat[x]) for x in max_out_steps]),
-    #                 ' '.join([str(avg_diff_best_pat[x]) for x in max_out_steps]),
-    #                 ' '.join(
-    #                     [str(punish_avg_diff_best_pat[x]) for x in max_out_steps]
-    #                 ),
-    #                 ' '.join(
-    #                     [str(punish_avg_diff_all_pat[x]) for x in max_out_steps]
-    #                 ),
-    #                 ' '.join(
-    #                     [str(avg_diff_all_pat_punished[x]) for x in max_out_steps]
-    #                 ),
-    #                 ' '.join(
-    #                     [str(avg_diff_best_pat_punished[x]) for x in max_out_steps]
-    #                 )
-    #             ))
-    #
-    # # TODO: find out why the difference of the gp scores in
-    # # five_best_patterns is wrong
-    #
-    # res = res[0:100]
-    # for res_ in res:
-    #     # print('max_out:' + str(res_[1]))
-    #     print_graph_pattern(res_)
-    #
-    # # TODO: also bind the second query with SOURCE and TARGET and add gp
-    # # to the query, then searching for hits can be skipped as well ?!


 if __name__ == '__main__':

From 6deb0ad754840fba0cb887561111fc5c43b9104c Mon Sep 17 00:00:00 2001
From: "philipp.neuer" 
Date: Thu, 6 Sep 2018 10:55:54 +0200
Subject: [PATCH 26/27] Removed the use of private methods in
 to_sparql_deep_narrow_path_(inst_)query()

---
 graph_pattern.py | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/graph_pattern.py b/graph_pattern.py
index 3cef58b..d46f654 100644
--- a/graph_pattern.py
+++ b/graph_pattern.py
@@ -652,7 +652,7 @@ def to_sparql_deep_narrow_path_query(
         avg_var_to_count = Variable('avgc' + ''.join(var_to_count))
         res = "SELECT %(vtf)s (AVG(%(cvtc)s) as %(avtc)s) {\n" \
               "SELECT %(stv)s %(vtf)s (COUNT (%(vtc)s) as %(cvtc)s) {\n" \
-              "%(val)s\n" \
+              "%(val)s" \
              "%(trip)s }\n" \
              "GROUP BY %(stv)s %(vtf)s }\n" \
              "GROUP BY %(vtf)s\n" \
@@ -663,13 +663,17 @@ def to_sparql_deep_narrow_path_query(
            'stv': ''.join(startvar.n3()),
            'vtc': ''.join(var_to_count.n3()),
            'val': ''.join([
-                self._sparql_values_part(
-                    values=valueblocks[key], indent=' '
-                ) for key in valueblocks
+                'VALUES (%s) {\n%s }\n' % (
+                    ' '.join(var.n3() for var in valueblocks[key].keys()[0]),
+                    ''.join(['(%s)\n' %
+                             ' '.join(self.curify(v) for v in vt)
+                             for vt in valueblocks[key][(key,)]])
+                ) for key in valueblocks.keys()
            ]),
            'trip': ''.join([
-                step._sparql_triples_part(indent=' ') for step in steps
-                # TODO: don't use private method
+                '%s %s %s .\n' % (s.n3(), p.n3(), o.n3())
+                for step in steps
+                for s, p, o in step
            ]) + ''.join([
                self._sparql_triples_part(indent=' ') if gp_in else ''
            ]),
@@ -688,19 +692,23 @@ def to_sparql_deep_narrow_path_inst_query(
    ):
        # TODO: Maybe use a limit
        res = "SELECT %(vtf)s (COUNT (?source) as ?cst) {\n" \
-              "%(val)s\n" \
+              "%(val)s" \
              "%(trip)s }\n" \
              "GROUP BY %(vtf)s\n" \
              "HAVING (COUNT (?source) > 0)" % {
            'vtf': ' '.join([var.n3() for var in hop]),
            'val': ''.join([
-                self._sparql_values_part(
-                    values=valueblocks[key], indent=' '
-                ) for key in valueblocks
+                'VALUES (%s) {\n%s }\n' % (
+                    ' '.join(var.n3() for var in valueblocks[key].keys()[0]),
+                    ''.join(['(%s)\n' %
+                             ' '.join(self.curify(v) for v in vt)
+                             for vt in valueblocks[key].values()[0]])
+                ) for key in valueblocks.keys()
            ]),
            'trip': ''.join([
-                step._sparql_triples_part() for step in steps
-                # TODO: don't use private method
+                '%s %s %s .\n' % (s.n3(), p.n3(), o.n3())
+                for step in steps
+                for s, p, o in step
            ]) + ''.join([
                self._sparql_triples_part(indent=' ') if gp_in else ''
            ]),
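Editor's note on the hunks above: the inline construction now renders each
valueblock straight into a SPARQL VALUES clause. A quick standalone check of
the text it produces (assumptions: plain rdflib n3() stands in for the repo's
curify(), which additionally applies namespace prefixes; the patch's
.keys()[0]/.values()[0] indexing implies Python 2, so the list(...) wrappers
below are added for Python 3 compatibility):

    from rdflib import URIRef, Variable

    SOURCE_VAR, TARGET_VAR = Variable('source'), Variable('target')
    valueblock = {(SOURCE_VAR, TARGET_VAR): [
        (URIRef('http://dbpedia.org/resource/Berlin'),
         URIRef('http://dbpedia.org/resource/Germany')),
    ]}
    val = 'VALUES (%s) {\n%s }\n' % (
        ' '.join(var.n3() for var in list(valueblock.keys())[0]),
        ''.join('(%s)\n' % ' '.join(v.n3() for v in vt)
                for vt in list(valueblock.values())[0]),
    )
    print(val)
    # VALUES (?source ?target) {
    # (<http://dbpedia.org/resource/Berlin> <http://dbpedia.org/resource/Germany>)
    #  }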
From 05ae8434cf2551daa0ef8f4df8b9d94cf2647f8d Mon Sep 17 00:00:00 2001
From: "philipp.neuer" 
Date: Thu, 6 Sep 2018 13:10:36 +0200
Subject: [PATCH 27/27] Changed the alpha/beta values for the path-length
 distribution and the probabilities for fix_var_mut and deep_narrow_mut

---
 config/defaults.py | 8 ++++----
 gp_learner.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/config/defaults.py b/config/defaults.py
index 75d61e9..c697437 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -84,16 +84,16 @@
 MUTPB_EN_OUT_LINK = 0.5  # probability to add an outgoing triple (otherwise in)
 MUTPB_AE = 0.2  # prob to try adding an edge between two nodes
 MUTPB_ID = 0.05  # prob to increase distance between source and target by 1 hop
-MUTPB_FV = 0.4  # prob to fix a variable (SPARQL)
+MUTPB_FV = 0.25  # prob to fix a variable (SPARQL)
 MUTPB_FV_RGTP_SAMPLE_N = 128  # sample <= n remaining GTPs to fix variables for
 MUTPB_FV_SAMPLE_MAXN = 32  # max n of instantiations to sample from top k
 MUTPB_FV_QUERY_LIMIT = 256  # SPARQL query limit for the top k instantiations
 MUTPB_SP = 0.05  # prob to simplify pattern (warning: can restrict exploration)
 # TODO: Lower the MUTPB_DN
-MUTPB_DN = 0.5  # prob to try adding a deep and narrow path to a pattern
+MUTPB_DN = 0.6  # prob to try adding a deep and narrow path to a pattern
 MUTPB_DN_MAX_HOPS = 10  # Max number of hops in the deep narrow path
-MUTPB_DN_MAX_HOPS_ALPHA = 2.  # alpha value in a length beta distribution
-MUTPB_DN_MAX_HOPS_BETA = 5.  # beta value in a length beta distribution
+MUTPB_DN_MAX_HOPS_ALPHA = 1.15  # alpha value in a length beta distribution
+MUTPB_DN_MAX_HOPS_BETA = 1.85  # beta value in a length beta distribution
 MUTPB_DN_AVG_DEG_LIMIT = 10  # Max avg. reachable Nodes
 MUTPB_DN_MAX_HOP_INST = 10  # Max number of hop instances for the next query/ies

diff --git a/gp_learner.py b/gp_learner.py
index 3765dad..d183978 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -737,7 +737,7 @@ def mutate_deep_narrow_path(
     # with default values the distribution is as follows:
     # PDF: 1: 14 %, 2: 27 %, 3: 25 %, 4: 17 %, 5: 10 %, 6: 5 %, 7: 1.5 %, ...
     # CDF: 1: 14 %, 2: 40 %, 3: 66 %, 4: 83 %, 5: 93 %, 6: 98 %, 7: 99.6 %, ...
-    n = int(random.betavariate(alpha, beta) * (max_hops-1) + 1)
+    n = int(random.betavariate(alpha, beta) * max_hops + 1)
     nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]
     hops = [Variable('p%d' % i) for i in range(n + 1)]
     if not directions:
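Editor's note on the last hunk: the old formula
int(random.betavariate(alpha, beta) * (max_hops-1) + 1) could only ever reach
max_hops-1 hops, while the new scaling by max_hops makes the full range
1..MUTPB_DN_MAX_HOPS reachable. The PDF/CDF percentages quoted in the context
comment were computed for the previous defaults (alpha=2., beta=5.) and no
longer hold. A standalone sketch (not part of the patch) to re-derive the
hop-length distribution empirically for the new defaults:

    import random
    from collections import Counter

    def hop_length_distribution(alpha=1.15, beta=1.85, max_hops=10,
                                samples=100000):
        """Estimate the PDF of n in percent for the given parameters."""
        counts = Counter(
            int(random.betavariate(alpha, beta) * max_hops + 1)
            for _ in range(samples)
        )
        return {n: round(100.0 * c / samples, 1)
                for n, c in sorted(counts.items())}

    print(hop_length_distribution())  # percentage per hop count, n = 1..10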