Skip to content

Commit 0ec39a7

Browse files
committed
Refactoring of the deep-narrow-path query, its SPARQL generation (now uses a template), and argument passing, plus many minor cleanups
1 parent 62f45c1 commit 0ec39a7

File tree

3 files changed

+134
-105
lines changed

3 files changed

+134
-105
lines changed

gp_learner.py

+20-18
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
from gp_query import predict_query
4949
from gp_query import query_time_hard_exceeded
5050
from gp_query import query_time_soft_exceeded
51-
from gp_query import variable_substitution_deep_narrow_mut_query
51+
from gp_query import dnp_query
5252
from gp_query import variable_substitution_query
5353
from graph_pattern import canonicalize
5454
from graph_pattern import gen_random_var
@@ -655,15 +655,12 @@ def mutate_fix_var(
655655

656656

657657
def _mutate_deep_narrow_path_helper(
658-
sparql,
659-
timeout,
660-
gtp_scores,
661-
child,
662-
edge_var,
663-
node_var,
664-
gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
665-
limit_res=config.MUTPB_DN_QUERY_LIMIT,
666-
sample_n=config.MUTPB_FV_SAMPLE_MAXN,
658+
sparql, timeout, gtp_scores, child, edge_var, node_var,
659+
gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
660+
max_node_count=config.MUTPB_DN_MAX_NODE_COUNT,
661+
min_edge_count=config.MUTPB_DN_MIN_EDGE_COUNT,
662+
limit=config.MUTPB_DN_QUERY_LIMIT,
663+
sample_n=config.MUTPB_FV_SAMPLE_MAXN,
667664
):
668665
assert isinstance(child, GraphPattern)
669666
assert isinstance(gtp_scores, GTPScores)
@@ -675,10 +672,15 @@ def _mutate_deep_narrow_path_helper(
675672
gtp_sample_n = random.randint(1, gtp_sample_n)
676673

677674
ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
678-
n=gtp_sample_n)
679-
t, substitution_counts = variable_substitution_deep_narrow_mut_query(
680-
sparql, timeout, child, edge_var, node_var, ground_truth_pairs,
681-
limit_res)
675+
max_n=gtp_sample_n)
676+
t, substitution_counts = dnp_query(
677+
sparql, timeout, child, ground_truth_pairs,
678+
edge_var=edge_var,
679+
node_var=node_var,
680+
max_node_count=max_node_count,
681+
min_edge_count=min_edge_count,
682+
limit=limit,
683+
)
682684
edge_count, node_sum_count = substitution_counts
683685
if not node_sum_count:
684686
# the current pattern is unfit, as we can't find anything fulfilling it
@@ -698,7 +700,7 @@ def _mutate_deep_narrow_path_helper(
698700
for edge, node_sum in node_sum_count.items():
699701
ec = edge_count[edge]
700702
prio[edge] = ec / (node_sum / ec) # ec / AVG degree
701-
# randomly pick n of the substitutions with a prob ~ to their counts
703+
# randomly pick n of the substitutions with a prob ~ to their prios
702704
edges, prios = zip(*prio.most_common())
703705

704706
substs = sample_from_list(edges, prios, sample_n)
@@ -715,9 +717,9 @@ def _mutate_deep_narrow_path_helper(
715717
fixed = True
716718
orig_child = child
717719
children = [
718-
GraphPattern(child, mapping={edge_var: subst})
719-
for subst in substs
720-
]
720+
GraphPattern(child, mapping={edge_var: subst})
721+
for subst in substs
722+
]
721723
children = [
722724
c if fit_to_live(c) else orig_child
723725
for c in children

gp_query.py

+32-22
Original file line numberDiff line numberDiff line change
@@ -458,43 +458,55 @@ def _var_subst_res_update(res, update, **_):
458458
res += update
459459

460460

461-
def variable_substitution_deep_narrow_mut_query(
462-
sparql, timeout, graph_pattern, edge_var, node_var,
463-
source_target_pairs, limit_res, batch_size=config.BATCH_SIZE):
461+
def dnp_query(
462+
sparql, timeout, graph_pattern, source_target_pairs,
463+
edge_var, node_var, max_node_count, min_edge_count, limit,
464+
batch_size=config.BATCH_SIZE
465+
):
464466
_vars, _values, _ret_val_mapping = _get_vars_values_mapping(
465467
graph_pattern, source_target_pairs)
466-
_edge_var_node_var_and_vars = (edge_var, node_var, _vars)
467468
return _multi_query(
468469
sparql, timeout, graph_pattern, source_target_pairs, batch_size,
469-
_edge_var_node_var_and_vars, _values, _ret_val_mapping,
470-
_var_subst_dnp_res_init, _var_subst_dnp_chunk_q,
471-
_var_subst_dnp_chunk_result_ext,
472-
_res_update=_var_subst_dnp_update,
473-
limit=limit_res,
470+
_vars, _values, _ret_val_mapping,
471+
_dnp_res_init, _dnp_chunk_q,
472+
_dnp_chunk_result_ext,
473+
_res_update=_dnp_res_update,
474+
edge_var=edge_var,
475+
node_var=node_var,
476+
max_node_count=max_node_count,
477+
min_edge_count=min_edge_count,
478+
limit=limit,
474479
# non standard, passed via **kwds, see handling below
475480
)
476481

477482

478483
# noinspection PyUnusedLocal
479-
def _var_subst_dnp_res_init(_, **kwds):
484+
def _dnp_res_init(_, **kwds):
480485
return Counter(), Counter()
481486

482487

483-
def _var_subst_dnp_chunk_q(gp, _edge_var_node_var_and_vars,
484-
values_chunk, limit):
485-
edge_var, node_var, _vars = _edge_var_node_var_and_vars
486-
return gp.to_find_edge_var_for_narrow_path_query(
488+
def _dnp_chunk_q(
489+
gp, _vars, values_chunk,
490+
edge_var, node_var, max_node_count, min_edge_count, limit,
491+
**_
492+
):
493+
return gp.to_deep_narrow_path_query(
487494
edge_var=edge_var,
488495
node_var=node_var,
489496
vars_=_vars,
490497
values={_vars: values_chunk},
491-
limit_res=limit)
498+
max_node_count=max_node_count,
499+
min_edge_count=min_edge_count,
500+
limit=limit,
501+
)
492502

493503

494504
# noinspection PyUnusedLocal
495-
def _var_subst_dnp_chunk_result_ext(
496-
q_res, _edge_var_node_var_and_vars, _, **kwds):
497-
edge_var, node_var, _vars = _edge_var_node_var_and_vars
505+
def _dnp_chunk_result_ext(
506+
q_res, _vars, _,
507+
edge_var,
508+
**kwds
509+
):
498510
chunk_edge_count, chunk_node_sum = Counter(), Counter()
499511
res_rows_path = ['results', 'bindings']
500512
bindings = sparql_json_result_bindings_to_rdflib(
@@ -510,14 +522,12 @@ def _var_subst_dnp_chunk_result_ext(
510522
return chunk_edge_count, chunk_node_sum,
511523

512524

513-
def _var_subst_dnp_update(res, up, **_):
525+
def _dnp_res_update(res, up, **_):
514526
edge_count, node_sum_count = res
515-
try:
527+
if up:
516528
chunk_edge_count, chunk_node_sum = up
517529
edge_count.update(chunk_edge_count)
518530
node_sum_count.update(chunk_node_sum)
519-
except ValueError:
520-
pass
521531

522532

523533
def generate_stps_from_gp(sparql, gp):

graph_pattern.py

+82-65
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import logging
1717
import random
1818
import string
19+
import textwrap
1920

2021
import deap
2122
import deap.base
@@ -31,7 +32,6 @@
3132
import six
3233

3334
from utils import URIShortener
34-
import config
3535

3636
logger = logging.getLogger(__name__)
3737

@@ -717,84 +717,101 @@ def to_count_var_over_values_query(self, var, vars_, values, limit):
717717
res += 'LIMIT %d\n' % limit
718718
return self._sparql_prefix(res)
719719

720-
def to_find_edge_var_for_narrow_path_query(
721-
self, edge_var, node_var, vars_, values, limit_res,
722-
filter_node_count=config.MUTPB_DN_FILTER_NODE_COUNT,
723-
filter_edge_count=config.MUTPB_DN_FILTER_EDGE_COUNT,
720+
def to_deep_narrow_path_query(
721+
self, edge_var, node_var, vars_, values,
722+
limit, max_node_count, min_edge_count,
724723
):
725-
"""Counts possible substitutions for edge_var to get a narrow path
724+
"""Counts possible substitutions for edge_var to get a narrow path.
726725
727726
Meant to perform a query like this:
728-
SELECT *
729-
{
727+
PREFIX dbr: <http://dbpedia.org/resource/>
728+
SELECT * WHERE {
730729
{
731-
SELECT
732-
?edge_var
733-
(COUNT(*) AS ?edge_var_count)
734-
(MAX(?node_var_count) AS ?max_node_count)
735-
(COUNT(*)/AVG(?node_var_count) as ?prio_var)
736-
{
737-
SELECT DISTINCT
738-
?source ?target ?edge_var (COUNT(?node_var) AS ?node_var_count)
739-
{
740-
VALUES (?source ?target) {
741-
(dbr:Adolescence dbr:Youth)
742-
(dbr:Adult dbr:Child)
743-
(dbr:Angel dbr:Heaven)
744-
(dbr:Arithmetic dbr:Mathematics)
745-
}
746-
?node_var ?edge_var ?source .
747-
?source dbo:wikiPageWikiLink ?target .
748-
}
730+
SELECT ?edge_var
731+
(SUM(?node_var_count) AS ?node_var_sum)
732+
(COUNT(?source && ?target) AS ?edge_var_count)
733+
(MAX(?node_var_count) AS ?max_node_count)
734+
WHERE {
735+
SELECT DISTINCT ?source ?target ?edge_var
736+
(COUNT(?node_var) AS ?node_var_count)
737+
WHERE {
738+
VALUES (?source ?target) {
739+
(dbr:Barrel dbr:Wine)
740+
(dbr:Barrister dbr:Law)
741+
(dbr:Beak dbr:Bird)
742+
(dbr:Blanket dbr:Bed)
743+
}
744+
?node_var ?edge_var ?source .
745+
?source <http://dbpedia.org/ontology/wikiPageWikiLink> ?target .
749746
}
750-
GROUP BY ?edge_var
751-
ORDER BY DESC(?edge_var_count)
747+
}
748+
GROUP BY ?edge_var
752749
}
753-
FILTER(?max_node_count < 10 && ?edge_var_count > 1)
754-
}
755-
ORDER BY DESC(?prio_var)
756-
LIMIT 32
750+
FILTER(?max_node_count <= 10
751+
&& ?edge_var_count >= 2)
752+
}
753+
ORDER BY DESC(?edge_var_count) ASC(?node_var_sum)
754+
LIMIT 32
755+
756+
The idea here is to expand a random node (?source in the example above)
757+
with a new variable triple and then try to fix its edge in a way that the
758+
degree (?node_var_count) isn't too high (<= max_node_count). We're also
759+
interested in the avg degree being low. In light of query chunking the
760+
sum is returned here (instead of AVG).
761+
762+
Apart from minimizing the degrees, we would also like to maximize the
763+
number of stps an ?edge_var fixation is valid for (?edge_var_count).
764+
765+
See gp_learner.mutate_deep_narrow_path() for more.
757766
758767
:param edge_var: Edge variable to find substitution for.
759768
:param node_var: Node variable to count.
760769
:param vars_: List of vars to fix values for (e.g. ?source, ?target).
761770
:param values: List of value lists for vars_.
762-
:param filter_node_count: Filter on node count of edge variable.
763-
:param filter_edge_count: Filter for edge count of triples.
764-
:param limit_res : limit result size
771+
:param max_node_count: Max allowed node count (degree) per edge_var (<=).
772+
:param min_edge_count: Min number of stps an edge_var must be valid for (>=).
773+
:param limit: Limit on the result size.
765774
:return: Query String.
766775
"""
767776

768-
res = 'SELECT * WHERE {\n'
769-
res += ' {\n'\
770-
' SELECT %s (SUM (?node_var_count) AS %s) (COUNT(%s) AS %s) ' \
771-
'(MAX(?node_var_count) AS ?max_node_count) WHERE {\n' % (
772-
edge_var.n3(),
773-
NODE_VAR_SUM.n3(),
774-
' && '.join([v.n3() for v in vars_]),
775-
EDGE_VAR_COUNT.n3(), )
776-
res += ' SELECT DISTINCT %s %s (COUNT(%s) AS ?node_var_count) ' \
777-
'WHERE {\n ' % (' '.join([v.n3() for v in vars_]),
778-
edge_var.n3(), node_var.n3(), )
779-
res += self._sparql_values_part(values)
780-
781-
# triples part
782-
tres = []
783-
for s, p, o in self:
784-
tres.append('%s %s %s .' % (s.n3(), p.n3(), o.n3()))
785-
indent = ' ' * 3
786-
triples = indent + ('\n' + indent).join(tres) + '\n'
787-
res += triples
788-
res += ' }\n'\
789-
' }\n'
790-
res += ' GROUP BY %s\n' % edge_var.n3()
791-
res += ' }\n'
792-
res += ' FILTER(?max_node_count < %d && %s > %d)\n' \
793-
% (filter_node_count, EDGE_VAR_COUNT.n3(),
794-
filter_edge_count)
795-
res += '}\n'
796-
res += 'ORDER BY ASC(%s)\n' % NODE_VAR_SUM.n3()
797-
res += 'LIMIT %d' % limit_res
777+
res = '''\
778+
SELECT * WHERE {
779+
{
780+
SELECT %(edge_var)s
781+
(SUM(?node_var_count) AS %(node_var_sum)s)
782+
(COUNT(%(vars_and)s) AS %(edge_var_count)s)
783+
(MAX(?node_var_count) AS ?max_node_count)
784+
WHERE {
785+
SELECT DISTINCT %(vars)s %(edge_var)s
786+
(COUNT(%(node_var)s) AS ?node_var_count)
787+
WHERE {\n%(values_part)s %(triples)s
788+
}
789+
}
790+
GROUP BY %(edge_var)s
791+
}
792+
FILTER(?max_node_count <= %(max_node_count)d
793+
&& %(edge_var_count)s >= %(min_edge_count)d)
794+
}
795+
ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s)
796+
LIMIT %(limit)d
797+
''' % {
798+
# TODO: adapt self._sparql_values_part for template use (indent)
799+
'edge_var': edge_var.n3(),
800+
'node_var_sum': NODE_VAR_SUM.n3(),
801+
'vars_and': ' && '.join([v.n3() for v in vars_]),
802+
'edge_var_count': EDGE_VAR_COUNT.n3(),
803+
'vars': ' '.join([v.n3() for v in vars_]),
804+
'node_var': node_var.n3(),
805+
'values_part': self._sparql_values_part(
806+
values, indent=' '),
807+
'triples': '\n '.join(
808+
'%s %s %s .' % (s.n3(), p.n3(), o.n3()) for s, p, o in self
809+
),
810+
'limit': limit,
811+
'max_node_count': max_node_count,
812+
'min_edge_count': min_edge_count,
813+
}
814+
res = textwrap.dedent(res)
798815
return self._sparql_prefix(res)
799816

800817
def to_dict(self):

0 commit comments

Comments
 (0)