|
16 | 16 | import logging
|
17 | 17 | import random
|
18 | 18 | import string
|
| 19 | +import textwrap |
19 | 20 |
|
20 | 21 | import deap
|
21 | 22 | import deap.base
|
|
31 | 32 | import six
|
32 | 33 |
|
33 | 34 | from utils import URIShortener
|
34 |
| -import config |
35 | 35 |
|
36 | 36 | logger = logging.getLogger(__name__)
|
37 | 37 |
|
@@ -717,84 +717,101 @@ def to_count_var_over_values_query(self, var, vars_, values, limit):
|
717 | 717 | res += 'LIMIT %d\n' % limit
|
718 | 718 | return self._sparql_prefix(res)
|
719 | 719 |
|
720 |
| - def to_find_edge_var_for_narrow_path_query( |
721 |
| - self, edge_var, node_var, vars_, values, limit_res, |
722 |
| - filter_node_count=config.MUTPB_DN_FILTER_NODE_COUNT, |
723 |
| - filter_edge_count=config.MUTPB_DN_FILTER_EDGE_COUNT, |
| 720 | + def to_deep_narrow_path_query( |
| 721 | + self, edge_var, node_var, vars_, values, |
| 722 | + limit, max_node_count, min_edge_count, |
724 | 723 | ):
|
725 |
| - """Counts possible substitutions for edge_var to get a narrow path |
| 724 | + """Counts possible substitutions for edge_var to get a narrow path. |
726 | 725 |
|
727 | 726 | Meant to perform a query like this:
|
728 |
| - SELECT * |
729 |
| - { |
| 727 | + PREFIX dbr: <http://dbpedia.org/resource/> |
| 728 | + SELECT * WHERE { |
730 | 729 | {
|
731 |
| - SELECT |
732 |
| - ?edge_var |
733 |
| - (COUNT(*) AS ?edge_var_count) |
734 |
| - (MAX(?node_var_count) AS ?max_node_count) |
735 |
| - (COUNT(*)/AVG(?node_var_count) as ?prio_var) |
736 |
| - { |
737 |
| - SELECT DISTINCT |
738 |
| - ?source ?target ?edge_var (COUNT(?node_var) AS ?node_var_count) |
739 |
| - { |
740 |
| - VALUES (?source ?target) { |
741 |
| - (dbr:Adolescence dbr:Youth) |
742 |
| - (dbr:Adult dbr:Child) |
743 |
| - (dbr:Angel dbr:Heaven) |
744 |
| - (dbr:Arithmetic dbr:Mathematics) |
745 |
| - } |
746 |
| - ?node_var ?edge_var ?source . |
747 |
| - ?source dbo:wikiPageWikiLink ?target . |
748 |
| - } |
| 730 | + SELECT ?edge_var |
| 731 | + (SUM(?node_var_count) AS ?node_var_sum) |
| 732 | + (COUNT(?source && ?target) AS ?edge_var_count) |
| 733 | + (MAX(?node_var_count) AS ?max_node_count) |
| 734 | + WHERE { |
| 735 | + SELECT DISTINCT ?source ?target ?edge_var |
| 736 | + (COUNT(?node_var) AS ?node_var_count) |
| 737 | + WHERE { |
| 738 | + VALUES (?source ?target) { |
| 739 | + (dbr:Barrel dbr:Wine) |
| 740 | + (dbr:Barrister dbr:Law) |
| 741 | + (dbr:Beak dbr:Bird) |
| 742 | + (dbr:Blanket dbr:Bed) |
| 743 | + } |
| 744 | + ?node_var ?edge_var ?source . |
| 745 | + ?source <http://dbpedia.org/ontology/wikiPageWikiLink> ?target . |
749 | 746 | }
|
750 |
| - GROUP BY ?edge_var |
751 |
| - ORDER BY DESC(?edge_var_count) |
| 747 | + } |
| 748 | + GROUP BY ?edge_var |
752 | 749 | }
|
753 |
| - FILTER(?max_node_count < 10 && ?edge_var_count > 1) |
754 |
| - } |
755 |
| - ORDER BY DESC(?prio_var) |
756 |
| - LIMIT 32 |
| 750 | + FILTER(?max_node_count <= 10 |
| 751 | + && ?edge_var_count >= 2) |
| 752 | + } |
| 753 | + ORDER BY DESC(?edge_var_count) ASC(?node_var_sum) |
| 754 | + LIMIT 32 |
| 755 | +
|
| 756 | + The idea here is to expand a random node (?source in the example above) |
| 757 | + with a new variable triple and then try to fix its edge in a way that the |
| 758 | + degree (?node_var_count) isn't too high (<= max_node_count). We're also |
| 759 | + interested in the avg degree being low. In light of query chunking the |
| 760 | + sum is returned here (instead of AVG). |
| 761 | +
|
| 762 | + Apart from minimizing the degrees, we would also like to maximize the |
| 763 | + number of source-target pairs (stps) an ?edge_var fixation is valid for (?edge_var_count). |
| 764 | +
|
| 765 | + See gp_learner.mutate_deep_narrow_path() for more. |
757 | 766 |
|
758 | 767 | :param edge_var: Edge variable to find substitution for.
|
759 | 768 | :param node_var: Node variable to count.
|
760 | 769 | :param vars_: List of vars to fix values for (e.g. ?source, ?target).
|
761 | 770 | :param values: List of value lists for vars_.
|
762 |
| - :param filter_node_count: Filter on node count of edge variable. |
763 |
| - :param filter_edge_count: Filter for edge count of triples. |
764 |
| - :param limit_res : limit result size |
| 771 | + :param max_node_count: Inclusive upper bound on the max node count of an edge_var fixation. |
| 772 | + :param min_edge_count: Inclusive lower bound on the edge count of an edge_var fixation. |
| 773 | + :param limit: Maximum number of results to return. |
765 | 774 | :return: Query String.
|
766 | 775 | """
|
767 | 776 |
|
768 |
| - res = 'SELECT * WHERE {\n' |
769 |
| - res += ' {\n'\ |
770 |
| - ' SELECT %s (SUM (?node_var_count) AS %s) (COUNT(%s) AS %s) ' \ |
771 |
| - '(MAX(?node_var_count) AS ?max_node_count) WHERE {\n' % ( |
772 |
| - edge_var.n3(), |
773 |
| - NODE_VAR_SUM.n3(), |
774 |
| - ' && '.join([v.n3() for v in vars_]), |
775 |
| - EDGE_VAR_COUNT.n3(), ) |
776 |
| - res += ' SELECT DISTINCT %s %s (COUNT(%s) AS ?node_var_count) ' \ |
777 |
| - 'WHERE {\n ' % (' '.join([v.n3() for v in vars_]), |
778 |
| - edge_var.n3(), node_var.n3(), ) |
779 |
| - res += self._sparql_values_part(values) |
780 |
| - |
781 |
| - # triples part |
782 |
| - tres = [] |
783 |
| - for s, p, o in self: |
784 |
| - tres.append('%s %s %s .' % (s.n3(), p.n3(), o.n3())) |
785 |
| - indent = ' ' * 3 |
786 |
| - triples = indent + ('\n' + indent).join(tres) + '\n' |
787 |
| - res += triples |
788 |
| - res += ' }\n'\ |
789 |
| - ' }\n' |
790 |
| - res += ' GROUP BY %s\n' % edge_var.n3() |
791 |
| - res += ' }\n' |
792 |
| - res += ' FILTER(?max_node_count < %d && %s > %d)\n' \ |
793 |
| - % (filter_node_count, EDGE_VAR_COUNT.n3(), |
794 |
| - filter_edge_count) |
795 |
| - res += '}\n' |
796 |
| - res += 'ORDER BY ASC(%s)\n' % NODE_VAR_SUM.n3() |
797 |
| - res += 'LIMIT %d' % limit_res |
| 777 | + res = '''\ |
| 778 | + SELECT * WHERE { |
| 779 | + { |
| 780 | + SELECT %(edge_var)s |
| 781 | + (SUM(?node_var_count) AS %(node_var_sum)s) |
| 782 | + (COUNT(%(vars_and)s) AS %(edge_var_count)s) |
| 783 | + (MAX(?node_var_count) AS ?max_node_count) |
| 784 | + WHERE { |
| 785 | + SELECT DISTINCT %(vars)s %(edge_var)s |
| 786 | + (COUNT(%(node_var)s) AS ?node_var_count) |
| 787 | + WHERE {\n%(values_part)s %(triples)s |
| 788 | + } |
| 789 | + } |
| 790 | + GROUP BY %(edge_var)s |
| 791 | + } |
| 792 | + FILTER(?max_node_count <= %(max_node_count)d |
| 793 | + && %(edge_var_count)s >= %(min_edge_count)d) |
| 794 | + } |
| 795 | + ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s) |
| 796 | + LIMIT %(limit)d |
| 797 | + ''' % { |
| 798 | + # TODO: adapt self._sparql_values_part for template use (indent) |
| 799 | + 'edge_var': edge_var.n3(), |
| 800 | + 'node_var_sum': NODE_VAR_SUM.n3(), |
| 801 | + 'vars_and': ' && '.join([v.n3() for v in vars_]), |
| 802 | + 'edge_var_count': EDGE_VAR_COUNT.n3(), |
| 803 | + 'vars': ' '.join([v.n3() for v in vars_]), |
| 804 | + 'node_var': node_var.n3(), |
| 805 | + 'values_part': self._sparql_values_part( |
| 806 | + values, indent=' '), |
| 807 | + 'triples': '\n '.join( |
| 808 | + '%s %s %s .' % (s.n3(), p.n3(), o.n3()) for s, p, o in self |
| 809 | + ), |
| 810 | + 'limit': limit, |
| 811 | + 'max_node_count': max_node_count, |
| 812 | + 'min_edge_count': min_edge_count, |
| 813 | + } |
| 814 | + res = textwrap.dedent(res) |
798 | 815 | return self._sparql_prefix(res)
|
799 | 816 |
|
800 | 817 | def to_dict(self):
|
|
0 commit comments