WIP: deep narrow paths mutation #10

Open
wants to merge 27 commits into base: master
Changes from 5 commits
Commits (27)
8fbd1f1
Test for evaluate() and mutate_fix_var()
pneuer Jun 21, 2018
db78db4
modified test_fv_eval.py
pneuer Jun 25, 2018
07d1f39
Test to find one hop patterns with SAMPLE-Queries
pneuer Jul 10, 2018
0837c10
Test finished, alg. not yet in learner
pneuer Aug 30, 2018
f67c730
Deep-and-narrow-path mutation should be runnable
pneuer Aug 31, 2018
c908caf
Small change and bug fix in gp_learner.py
pneuer Sep 3, 2018
adbc215
Undone modifying unrelated stuff
pneuer Sep 4, 2018
20e5b34
Renamed two values and added alpha/beta values for the path length …
pneuer Sep 4, 2018
22a786e
Changed values MUTPB_DN_MAX_HOPS_ALPHA / BETA
pneuer Sep 4, 2018
91cbde0
Changed order in mutate_deep_narrow()
pneuer Sep 5, 2018
9c3238a
Renamed MUTPB_DN_AVG_LIMIT
pneuer Sep 5, 2018
6362dc8
Added beta distribution for mut-length and dnp-mut only if not fixvar
pneuer Sep 5, 2018
1130da4
Code-Style changes and renamed mutate_deep_narrow to mutate_deep_narr…
pneuer Sep 5, 2018
12a95ae
Renamed useful_path_(inst_)query to deep_narrow_path_(inst_)query
pneuer Sep 5, 2018
22ca6aa
Renamed to_sparql_useful_path/_inst_query() to to_sparql_deep_narrow_…
pneuer Sep 5, 2018
126e84d
Undone the changes in requirements.txt
pneuer Sep 5, 2018
331e06f
Added default-value for max instances of hops
pneuer Sep 5, 2018
49b5c4d
Renamed the correct to_sparql_deep_narrow_path_inst_query()
pneuer Sep 5, 2018
c0617ea
Added docstring for mutate_deep_narrow_path() AND renamed direct and c…
pneuer Sep 5, 2018
82cdacf
Renamed the correct to_sparql_depp_narrow_path_inst_query()
pneuer Sep 5, 2018
9117d05
Comments -> english
pneuer Sep 5, 2018
d792d10
Erased all unused to_sparql_*_query()
pneuer Sep 5, 2018
75bd1ea
Comments -> english
pneuer Sep 5, 2018
72f2fee
deleted test_fv_eval.py and SPARQL-query.py
pneuer Sep 5, 2018
e2e09a4
Erased everything except the test for the mutation in the learner
pneuer Sep 5, 2018
6deb0ad
Erased the use of private methods in to_sparql_deep_narrow_path_(inst…
pneuer Sep 6, 2018
05ae843
Changed the alpha/beta values for the path-length distribution and the …
pneuer Sep 6, 2018
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@ venv/
# ignore py compiled etc. files
*.pyc
*.pyo

# ignore .idea
.idea/
3 changes: 3 additions & 0 deletions config/defaults.py
@@ -89,6 +89,9 @@
MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k
MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations
MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration)
MUTPB_DN = 0.5 # prob to try adding a deep and narrow path to a pattern
Member Author: probably should be a lot lower in final version

MUTPB_DN_PS_MAX_N = 10 # Max steps in the deep narrow path
Member Author: MUTPB_DN_MAX_HOPS ?

Member Author: maybe also use ALPHA, BETA as in INIT_POP_LEN_ALPHA|BETA

MUTPB_DN_AVG_LIMIT = 10 # Max avg. reachable Nodes
Member Author: MUTP_DN_AVG_DEG_LIMIT


# fusion of target candidates:
FUSION_SAMPLES_PER_CLASS = 500 # only use up to n training samples per class
128 changes: 126 additions & 2 deletions gp_learner.py
@@ -54,6 +54,8 @@
from gp_query import query_stats
from gp_query import query_time_hard_exceeded
from gp_query import query_time_soft_exceeded
from gp_query import useful_path_query
from gp_query import useful_path_inst_query
from gp_query import variable_substitution_query
from graph_pattern import canonicalize
from graph_pattern import gen_random_var
@@ -684,6 +686,121 @@ def mutate_fix_var(
]
return res

def mutate_deep_narrow(
Member Author: mutate_deep_narrow_path

sparql,
timeout,
child,
Member Author: gtp_scores first

gtp_scores,
dn_path_steps_max_n=config.MUTPB_DN_PS_MAX_N,
direct=None,
Member Author: directions? should it be a list???

childin=False,
Member Author: ?

limit=config.MUTPB_FV_QUERY_LIMIT, # TODO: actually use the limit?
):
if not child.matching_node_pairs:
Member Author: probably doesn't exist on uneval children

ev = evaluate(
sparql, timeout, gtp_scores, child) # TODO: do run/gen have to be passed here?
Member Author: set run=-1, gen=-1, it's only used for logging atm

update_individuals([child], [ev])
gtps = child.matching_node_pairs
if not gtps:
return [child]
# TODO: test which distribution works well here
n = random.choice(range(dn_path_steps_max_n))+1
n = 2
Member Author: rather configurable beta distribution

node = [SOURCE_VAR]
Member Author: why not simply nodes = [SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)] + [TARGET_VAR]?

for i in range(n):
node.append(Variable('n%i' % i))
Member Author: use %d for consistency please

node.append(TARGET_VAR)
hop = [Variable('p%i' % i) for i in range(n + 1)]
Member Author: use %d for consistency please

Member Author: also: if it's a list it's a convention to use plural, so hops

# TODO: remove if direct should just always be chosen at random
if direct is None or len(direct) != n + 1:
logger.debug(
'No direction chosen, or direction tuple with false length'
)
direct = [0 for _ in range(n + 1)]
Member Author: pull below here: directions = [random.choice([-1, 1]) for _ in range(n+1)]

gp_helper = []
for i in range(n + 1):
if direct[i] == 0:
direct[i] = random.choice([-1, 1])
Member Author: why not just init as above?

if direct[i] == 1:
gp_helper.append(
GraphPattern([(node[i], hop[i], node[i + 1])])
)
else:
gp_helper.append(
GraphPattern([(node[i + 1], hop[i], node[i])])
)
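Taken together, the naming and initialisation comments above (plural lists, %d, pre-drawn directions) amount to roughly this sketch; the graph_pattern import location is assumed:

```python
import random

from rdflib import Variable

from graph_pattern import SOURCE_VAR, TARGET_VAR  # assumed import location


def make_path_vars(n):
    # one node variable per intermediate step, framed by ?source and ?target
    nodes = ([SOURCE_VAR] + [Variable('n%d' % i) for i in range(n)]
             + [TARGET_VAR])
    # one predicate (hop) variable per edge of the chain
    hops = [Variable('p%d' % i) for i in range(n + 1)]
    # +1 / -1: the i-th triple points forwards or backwards along the chain
    directions = [random.choice([-1, 1]) for _ in range(n + 1)]
    return nodes, hops, directions
```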
# queries for the individual path steps
valueblocks_s = {}
valueblocks_t = {}
for i in range(int((n / 2) + 1)):
Member Author: n // 2

if i < int(n/2):
Member Author: comments explaining case

t, q_res = useful_path_query(
sparql,
timeout,
child,
hop[i],
node[i+1],
valueblocks_s,
gp_helper[:i + 1],
SOURCE_VAR,
gp_in=childin,
)
if not q_res:
return [child]
valueblocks_s[hop[i]] = {
(hop[i],): random.sample(
[(q_r,) for q_r in q_res],
min(10, len(q_res))
Member Author: 10 magic number? use limit from above?

)
}
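One way to address the magic-number comment above, sketched as a suggestion rather than the PR's code (limit is the keyword argument this function already accepts):

```python
# Hedged sketch only: reuse limit (or a dedicated config value) for the
# sample size instead of the literal 10.
sample_n = min(limit, len(q_res))
valueblocks_s[hop[i]] = {
    (hop[i],): random.sample([(q_r,) for q_r in q_res], sample_n)
}
```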
if n-i > i:
t, q_res = useful_path_query(
sparql,
timeout,
child,
hop[n-i],
node[n-i],
valueblocks_t,
gp_helper[n - i:],
TARGET_VAR,
gp_in=childin,
)
if not q_res:
return [child]
valueblocks_t[hop[n-i]] = {
(hop[n-i],): random.sample(
[(q_r,) for q_r in q_res],
min(10, len(q_res))
)
}
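Picking up the "n // 2" and "comments explaining case" notes above, a small self-contained illustration of the index logic only (no queries): for n + 1 hops, value blocks are collected alternately from the source and target side, and the middle hop is always left unconstrained so the instantiation query below can bind it.

```python
# Hedged illustration of the loop bounds above, not part of the PR.
def fixing_order(n):
    order = []
    for i in range(n // 2 + 1):
        if i < n // 2:
            order.append(('p%d' % i, 'source side'))
        if n - i > i:
            order.append(('p%d' % (n - i), 'target side'))
    return order

# fixing_order(2) == [('p0', 'source side'), ('p2', 'target side')]
# fixing_order(3) == [('p0', 'source side'), ('p3', 'target side'),
#                     ('p2', 'target side')]
```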

# query for the final result
# shared source/target block so that only "real" paths are found
valueblocks = {}
valueblocks.update(valueblocks_s)
valueblocks.update(valueblocks_t)
t, q_res = useful_path_inst_query(
sparql,
timeout,
child,
hop,
valueblocks,
gp_helper,
gp_in=childin
)
if not q_res:
return [child]
res = []
for inst in q_res:
child_inst = GraphPattern([
(node[i], inst[i], node[i + 1]) if direct[i] == 1
else (node[i + 1], inst[i], node[i])
for i in range(n + 1)
])
res.append(GraphPattern(child + child_inst))
return res


def mutate_simplify_pattern(gp):
if len(gp) < 2:
@@ -797,6 +914,7 @@ def mutate(
pb_mv=config.MUTPB_MV,
pb_sp=config.MUTPB_SP,
pb_sv=config.MUTPB_SV,
pb_dn=config.MUTPB_DN,
):
# mutate patterns:
# grow: select random identifier and convert them into a var (local)
@@ -837,8 +955,14 @@
else:
children = [child]


# TODO: deep & narrow paths mutation
helper = []
for child in children:
Member Author: i'd actually only do this if not pb_fv, as otherwise we might have n*m children here...

if random.random() < pb_dn:
res = mutate_deep_narrow(sparql, timeout, gtp_scores, child)
helper += res
else:
helper.append(child)
children = helper
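A hedged sketch of the concern raised above: only attempt the deep/narrow mutation when the fix-var branch did not already fan out into several children. The mutated_fv flag is hypothetical and would have to be set in the pb_fv branch further up:

```python
# Hedged sketch only; mutated_fv is a hypothetical flag, not in the PR.
if not mutated_fv:
    helper = []
    for child in children:
        if random.random() < pb_dn:
            helper += mutate_deep_narrow(sparql, timeout, gtp_scores, child)
        else:
            helper.append(child)
    children = helper
```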

children = {
c if fit_to_live(c) else orig_child
141 changes: 141 additions & 0 deletions gp_query.py
@@ -62,6 +62,8 @@ def __init__(self):
self.ask_multi_query_count = 0
self.combined_ask_count_multi_query_count = 0
self.variable_substitution_query_count = 0
self.useful_path_query_count = 0
self.useful_path_inst_query_count = 0
self.predict_query_count = 0
self.count_query_count = 0

@@ -695,6 +697,145 @@ def _var_subst_chunk_result_ext(q_res, _sel_var_and_vars, _, **kwds):

def _var_subst_res_update(res, update, **_):
res += update


def useful_path_query(
Member Author: please rename to deep_narrow_path_query, helps to make the connection between query and mutation

sparql,
timeout,
graph_pattern,
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=config.MUTPB_DN_AVG_LIMIT,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_query_count += 1
# TODO: maybe subtract 10 per 'fixed' variable from batch_size
# (since its value block becomes part of the query)
_values = graph_pattern.matching_node_pairs
# TODO: maybe intersect with the not yet covered pairs
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (
var_to_fix, var_to_count, startvar, valueblocks, steps, avglimit, gp_in
)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_usef_path_res_init, _usef_path_chunk_q, _usef_path_chunk_result_ext,
_usef_path_res_update
)


# noinspection PyUnusedLocal
def _usef_path_res_init(_, **kwds):
return []


def _usef_path_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
valueblocks = {
startvar: {
(startvar,):
[(tup[0],) for tup in values_chunk] if startvar == SOURCE_VAR
else [(tup[1],) for tup in values_chunk]
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_useful_path_query(
var_to_fix,
var_to_count,
valueblocks,
steps,
startvar,
avglimit=avglimit,
gp_in=gp_in
)


# noinspection PyUnusedLocal
def _usef_path_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
var_to_fix, var_to_count, startvar, _valueblocks, steps, avglimit, gp_in \
= _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)

for row in bindings:
# TODO: think about whether the avg. outgoing degrees should also be
# returned
chunk_res.append(get_path(row, [var_to_fix]))
return chunk_res


def _usef_path_res_update(res, update, **_):
res += update


def useful_path_inst_query(
Member Author: deep_query_path_inst_query

sparql,
timeout,
graph_pattern,
hop,
valueblocks,
steps,
gp_in=False,
batch_size=None
):
_query_stats.useful_path_inst_query_count += 1
# TODO: maybe subtract 10 per 'fixed' variable from batch_size
# (since its value block becomes part of the query)
_values = graph_pattern.matching_node_pairs
# maybe intersect with the not yet covered pairs
_ret_val_mapping = {stp: [stp] for stp in graph_pattern.matching_node_pairs}
_vars_steps_and_stuff = (hop, valueblocks, steps, gp_in)
return _multi_query(
sparql, timeout, graph_pattern, graph_pattern.matching_node_pairs,
batch_size, _vars_steps_and_stuff, _values, _ret_val_mapping,
_usef_path_inst_res_init, _usef_path_inst_chunk_q,
_usef_path_inst_chunk_result_ext, _usef_path_inst_res_update
)


# noinspection PyUnusedLocal
def _usef_path_inst_res_init(_, **kwds):
return []


def _usef_path_inst_chunk_q(gp, _vars_steps_and_stuff, values_chunk):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
valueblocks = {
'st': {
(SOURCE_VAR, TARGET_VAR): values_chunk
}
}
valueblocks.update(_valueblocks)
return gp.to_sparql_useful_path_inst_query(
hop, valueblocks, steps, gp_in=gp_in
)


# noinspection PyUnusedLocal
def _usef_path_inst_chunk_result_ext(q_res, _vars_steps_and_stuff, _, **kwds):
hop, _valueblocks, steps, gp_in = _vars_steps_and_stuff
chunk_res = []
res_rows_path = ['results', 'bindings']
bindings = sparql_json_result_bindings_to_rdflib(
get_path(q_res, res_rows_path, default=[])
)

for row in bindings:
chunk_res.append([get_path(row, [h]) for h in hop])
return chunk_res


def _usef_path_inst_res_update(res, update, **_):
res += update


def generate_stps_from_gp(sparql, gp):