WIP: deep narrow paths mutation #10

Open
wants to merge 27 commits into base: master
Changes from 5 commits
Commits (27)
8fbd1f1
Test for evaluate() and mutate_fix_var()
pneuer Jun 21, 2018
db78db4
modified test_fv_eval.py
pneuer Jun 25, 2018
07d1f39
Test to find one hop patterns with SAMPLE-Queries
pneuer Jul 10, 2018
0837c10
Test finished, alg. not yet in learner
pneuer Aug 30, 2018
f67c730
Deep-and-narrow-path mutation should be runnable
pneuer Aug 31, 2018
c908caf
Small change and bug fix in gp_learner.py
pneuer Sep 3, 2018
adbc215
Undone modifying unrelated stuff
pneuer Sep 4, 2018
20e5b34
Renamed two values and added alpha/beta values for the path length …
pneuer Sep 4, 2018
22a786e
Changed values MUTPB_DN_MAX_HOPS_ALPHA / BETA
pneuer Sep 4, 2018
91cbde0
Changed order in mutate_deep_narrow()
pneuer Sep 5, 2018
9c3238a
Renamed MUTPB_DN_AVG_LIMIT
pneuer Sep 5, 2018
6362dc8
Added beta distribution for mut-length and dnp-mut only if not fixvar
pneuer Sep 5, 2018
1130da4
Code-Style changes and renamed mutate_deep_narrow to mutate_deep_narr…
pneuer Sep 5, 2018
12a95ae
Renamed useful_path_(inst_)query to deep_narrow_path_(inst_)query
pneuer Sep 5, 2018
22ca6aa
Renamed to_sparql_useful_path/_inst_query() to to_sparql_deep_narrow_…
pneuer Sep 5, 2018
126e84d
Undone the changes in requirements.txt
pneuer Sep 5, 2018
331e06f
Added default-value for max instances of hops
pneuer Sep 5, 2018
49b5c4d
Renamed the correct to_sparql_deep_narrow_path_inst_query()
pneuer Sep 5, 2018
c0617ea
Added docstring for mutate_deep_narrow_path() AND renamed direct and c…
pneuer Sep 5, 2018
82cdacf
Renamed the correct to_sparql_depp_narrow_path_inst_query()
pneuer Sep 5, 2018
9117d05
Comments -> english
pneuer Sep 5, 2018
d792d10
Erased all unused to_sparql_*_query()
pneuer Sep 5, 2018
75bd1ea
Comments -> english
pneuer Sep 5, 2018
72f2fee
deleted test_fv_eval.py and SPARQL-query.py
pneuer Sep 5, 2018
e2e09a4
Erased everything except the test for the mutation in the learner
pneuer Sep 5, 2018
6deb0ad
Erased the use of private methods in to_sparql_deep_narrow_path_(inst…
pneuer Sep 6, 2018
05ae843
Changed the alpha/beta values for the path-length distribution and the …
pneuer Sep 6, 2018
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@ venv/
# ignore py compiled etc. files
*.pyc
*.pyo

# ignore .idea
.idea/
3 changes: 3 additions & 0 deletions config/defaults.py
@@ -89,6 +89,9 @@
MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k
MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations
MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration)
MUTPB_DN = 0.5 # prob to try adding a deep and narrow path to a pattern
Member Author: probably should be a lot lower in final version

MUTPB_DN_PS_MAX_N = 10 # Max steps in the deep narrow path
Member Author: MUTPB_DN_MAX_HOPS ?

Member Author: maybe also use ALPHA, BETA as in INIT_POP_LEN_ALPHA|BETA

MUTPB_DN_AVG_LIMIT = 10 # Max avg. reachable Nodes
Member Author: MUTP_DN_AVG_DEG_LIMIT


# fusion of target candidates:
FUSION_SAMPLES_PER_CLASS = 500 # only use up to n training samples per class
128 changes: 126 additions & 2 deletions gp_learner.py
@@ -54,6 +54,8 @@
from gp_query import query_stats
from gp_query import query_time_hard_exceeded
from gp_query import query_time_soft_exceeded
from gp_query import useful_path_query
from gp_query import useful_path_inst_query
from gp_query import variable_substitution_query
from graph_pattern import canonicalize
from graph_pattern import gen_random_var
@@ -684,6 +686,121 @@ def mutate_fix_var(
]
return res

def mutate_deep_narrow(
Member Author: mutate_deep_narrow_path

sparql,
timeout,
child,
Member Author: gtp_scores first

gtp_scores,
dn_path_steps_max_n=config.MUTPB_DN_PS_MAX_N,
direct=None,
Member Author: directions? should it be a list???

childin=False,
Member Author: ?

limit=config.MUTPB_FV_QUERY_LIMIT, # TODO: actually use the limit?
):
if not child.matching_node_pairs:
Member Author: probably doesn't exist on uneval children

ev = evaluate(
sparql, timeout, gtp_scores, child) # TODO: do run/gen have to be passed here?
Member Author: set run=-1, gen=-1, it's only used for logging atm

update_individuals([child], [ev])
gtps = child.matching_node_pairs
if not gtps:
return [child]
# TODO: test which distribution works well here
n = random.choice(range(dn_path_steps_max_n))+1
n = 2
Member Author: rather configurable beta distribution

node = [SOURCE_VAR]
Member Author: why not simply nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]?

for i in range(n):
node.append(Variable('n%i' % i))
Member Author: use %d for consistency please

node.append(TARGET_VAR)
hop = [Variable('p%i' % i) for i in range(n + 1)]
Member Author: use %d for consistency please

Member Author: also: if it's a list it's a convention to use plural, so hops

# TODO: remove if direct should just always be chosen at random
if direct is None or len(direct) != n + 1:
logger.debug(
'No direction chosen, or direction tuple with false length'
)
direct = [0 for _ in range(n + 1)]
Member Author: pull below here: directions = [random.choice([-1, 1]) for _ in range(n+1)]

gp_helper = []
for i in range(n + 1):
if direct[i] == 0:
direct[i] = random.choice([-1, 1])
Member Author: why not just init as above?

if direct[i] == 1:
gp_helper.append(
GraphPattern([(node[i], hop[i], node[i + 1])])
)
else:
gp_helper.append(
GraphPattern([(node[i + 1], hop[i], node[i])])
)
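Taken together, the naming and initialisation comments above (plural lists, %d, pre-drawn directions) amount to roughly this sketch; the graph_pattern import location is assumed:

```python
import random

from rdflib import Variable

from graph_pattern import SOURCE_VAR, TARGET_VAR  # assumed import location


def make_path_vars(n):
    # one node variable per intermediate step, framed by ?source and ?target
    nodes = ([SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)]
             + [TARGET_VAR])
    # one predicate (hop) variable per edge of the chain
    hops = [Variable('p%d' % i) for i in range(n + 1)]
    # +1 / -1: the i-th triple points forwards or backwards along the chain
    directions = [random.choice([-1, 1]) for _ in range(n + 1)]
    return nodes, hops, directions
```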
# queries for the individual path steps
valueblocks_s = {}
valueblocks_t = {}
for i in range(int((n / 2) + 1)):
Member Author: n // 2

if i < int(n/2):
Member Author: comments explaining case

t, q_res = useful_path_query(
sparql,
timeout,
child,
hop[i],
node[i+1],
valueblocks_s,
gp_helper[:i + 1],
SOURCE_VAR,
gp_in=childin,
)
if not q_res:
return [child]
valueblocks_s[hop[i]] = {
(hop[i],): random.sample(
[(q_r,) for q_r in q_res],
min(10, len(q_res))
Member Author: 10 magic number? use limit from above?

)
}
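One way to address the magic-number comment above, sketched as a suggestion rather than the PR's code (limit is the keyword argument this function already accepts):

```python
# Hedged sketch only: reuse limit (or a dedicated config value) for the
# sample size instead of the literal 10.
sample_n = min(limit, len(q_res))
valueblocks_s[hop[i]] = {
    (hop[i],): random.sample([(q_r,) for q_r in q_res], sample_n)
}
```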
if n-i > i:
t, q_res = useful_path_query(
sparql,
timeout,
child,
hop[n-i],
node[n-i],
valueblocks_t,
gp_helper[n - i:],
TARGET_VAR,
gp_in=childin,
)
if not q_res:
return [child]
valueblocks_t[hop[n-i]] = {
(hop[n-i],): random.sample(
[(q_r,) for q_r in q_res],
min(10, len(q_res))
)
}
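Picking up the "n // 2" and "comments explaining case" notes above, a small self-contained illustration of the index logic only (no queries): for n + 1 hops, value blocks are collected alternately from the source and target side, and the middle hop is always left unconstrained so the instantiation query below can bind it.

```python
# Hedged illustration of the loop bounds above, not part of the PR.
def fixing_order(n):
    order = []
    for i in range(n // 2 + 1):
        if i < n // 2:
            order.append(('p%d' % i, 'source side'))
        if n - i > i:
            order.append(('p%d' % (n - i), 'target side'))
    return order

# fixing_order(2) == [('p0', 'source side'), ('p2', 'target side')]
# fixing_order(3) == [('p0', 'source side'), ('p3', 'target side'),
#                     ('p2', 'target side')]
```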

# query for the final result
# shared source/target block so that only "real" paths are found
valueblocks = {}
valueblocks.update(valueblocks_s)
valueblocks.update(valueblocks_t)
t, q_res = useful_path_inst_query(
sparql,
timeout,
child,
hop,
valueblocks,
gp_helper,
gp_in=childin
)
if not q_res:
return [child]
res = []
for inst in q_res:
child_inst = GraphPattern([
(node[i], inst[i], node[i + 1]) if direct[i] == 1
else (node[i + 1], inst[i], node[i])
for i in range(n + 1)
])
res.append(GraphPattern(child + child_inst))
return res


def mutate_simplify_pattern(gp):
if len(gp) < 2:
@@ -797,6 +914,7 @@ def mutate(
pb_mv=config.MUTPB_MV,
pb_sp=config.MUTPB_SP,
pb_sv=config.MUTPB_SV,
pb_dn=config.MUTPB_DN,
):
# mutate patterns:
# grow: select random identifier and convert them into a var (local)
@@ -837,8 +955,14 @@
else:
children = [child]


# TODO: deep & narrow paths mutation
helper = []
for child in children:
Member Author: i'd actually only do this if not pb_fv, as otherwise we might have n*m children here...

if random.random() < pb_dn:
res = mutate_deep_narrow(sparql, timeout, gtp_scores, child)
helper += res
else:
helper.append(child)
children = helper
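A hedged sketch of the concern raised above: only attempt the deep/narrow mutation when the fix-var branch did not already fan out into several children. The mutated_fv flag is hypothetical and would have to be set in the pb_fv branch further up:

```python
# Hedged sketch only; mutated_fv is a hypothetical flag, not in the PR.
if not mutated_fv:
    helper = []
    for child in children:
        if random.random() < pb_dn:
            helper += mutate_deep_narrow(sparql, timeout, gtp_scores, child)
        else:
            helper.append(child)
    children = helper
```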

children = {
c if fit_to_live(c) else orig_child
141 changes: 141 additions & 0 deletions gp_query.py
@@ -62,6 +62,8 @@ def __init__(self):
self.ask_multi_query_count = 0
self.combined_ask_count_multi_query_count = 0
self.variable_substitution_query_count = 0
self.useful_path_query_count = 0
self.useful_path_inst_query_count = 0
self.predict_query_count = 0
self.count_query_count = 0

@@ -695,6 +697,145 @@ def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds):

def _var_subst_res_update(res, update, **_):
res += update


def useful_path_query(
Member Author: please rename to deep_narrow_path_query, helps to make the connection between query and mutation

sparql,
timeout,
graph_pattern,
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=config.MUTPB_DN_AVG_LIMIT,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_query_count += 1
# TODO: maybe subtract 10 per 'fixed' variable from batch_size
# (since its value block becomes part of the query)
_values = graph_pattern.matching_node_pairs
# TODO: maybe intersect with the not yet covered pairs
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (
var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in
)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_usef_path_res_init, _usef_path_chunk_q, _usef_path_chunk_result_ext,
_usef_path_res_update
)


# noinspection PyUnusedLocal
def _usef_path_res_init(_, **kwds):
return []


def _usef_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
valueblocks = {
startvar: {
(startvar,):
[(tup[0],) for tup in values_chunk] if startvar == SOURCE_VAR
else [(tup[1],) for tup in values_chunk]
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_useful_path_query(
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=avglimit,
gp_in=gp_in
)


# noinspection PyUnusedLocal
def _usef_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)

for row in bindings:
# TODO: think about whether the avg. outgoing degrees should also be
# returned
chunk_res.append(get_path(row, [var_to_fix]))
return chunk_res


def _usef_path_res_update(res, update, **_):
res += update


def useful_path_inst_query(
Member Author: deep_query_path_inst_query

sparql,
timeout,
graph_pattern,
hop,
valueblocks,
steps,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_inst_query_count += 1
# TODO: maybe subtract 10 per 'fixed' variable from batch_size
# (since its value block becomes part of the query)
_values = graph_pattern.matching_node_pairs
# maybe intersect with the not yet covered pairs
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (hop, valueblocks, steps, gp_in)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_usef_path_inst_res_init, _usef_path_inst_chunk_q,
_usef_path_inst_chunk_result_ext, _usef_path_inst_res_update
)


# noinspection PyUnusedLocal
def _usef_path_inst_res_init(_, **kwds):
return []


def _usef_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
valueblocks = {
'st': {
(SOURCE_VAR, TARGET_VAR): values_chunk
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_useful_path_inst_query(
hop, valueblocks, steps, gp_in=gp_in
)


# noinspection PyUnusedLocal
def _usef_path_inst_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)

for row in bindings:
chunk_res.append([get_path(row, [h]) for h in hop])
return chunk_res


def _usef_path_inst_res_update(res, update, **_):
res += update


def generate_stps_from_gp(sparql, gp):