"""Helper methods for analyzing program source code."""
from collections import defaultdict
from tactic_utils import ApplyRulesMatching, SearchRules, GetMatcher, Fix
from mapper import MapperCodelet
from lazy_structure import LazyTextDocument, SPECIAL_CHARACTERS
from ts_utils import RegisterRule, AssertNodesEqual
from analogy_utils import Analogy

def LoadDocument(path, extra_chunks=None, extra_special=None):
    """Load a document from the file system into a LazyTextDocument.

    @extra_chunks: list of (start, length) line ranges in the document that
        should be chunked together.
    @extra_special: list of additional special tokens to recognize alongside
        SPECIAL_CHARACTERS.
    """
    extra_special = extra_special or []
    with open(path, "r") as in_file:
        text = in_file.readlines()
    chunks = []
    for start, length in (extra_chunks or []):
        chunks.append("".join(text[start:(start + length)]).lstrip(" "))
    text = "".join(text)
    return LazyTextDocument(text, chunks + extra_special + SPECIAL_CHARACTERS)
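
# Usage sketch (hypothetical file name and ranges, not part of this module):
# load a source file, chunking lines 3-4 together and adding "->" as an extra
# special token.  Note that the (start, length) pairs in extra_chunks index
# lines of the file, not characters.
#
#     before_doc = LoadDocument("before.py", extra_chunks=[(3, 2)],
#                               extra_special=["->"])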

def CompleteAnalogyTactic(structure, sources):
    """Heuristic for completing an analogy involving code transformations.

    @sources should be a list of (before, after) LazyTextDocument pairs.
    @sources[0][0] and @sources[1][0] should be the before-code of
    @sources[0][1] and @sources[1][1] respectively.
    @sources[2][0] should be the prompt's before-code, and @sources[2][1]
    should be a currently-empty LazyTextDocument to be generated by Sifter.
    """
    structure.ts.commit()
    print("Identifying word pairs...")
    ApplyRulesMatching(structure.rt, "SameWord", dict())

    print("Mapping the examples...")
    analogy = Analogy.Begin(structure.rt, {
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[1][0], sources[1][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }, extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Mapping the prompt...")
    analogy = Analogy.Begin(structure.rt, {
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[2][0], sources[2][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }, exists=True, extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Solving the prompt...")
    analogy.state = "concretize"
    Fix(analogy.ExtendMap, ["/:Document"])
    Fix(analogy.ExtendMap)
    Fix(analogy.ExtendMap, no_follow=True)
    Fix(ApplyRulesMatching, structure.rt, "SolveWord", dict())
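
# Usage sketch (hypothetical names; building `structure` and the empty output
# document is outside the scope of this module): sources[0] and sources[1] are
# worked before/after examples of the transformation, and sources[2] pairs the
# prompt's before-code with an empty document for Sifter to fill in.
#
#     sources = [
#         (LoadDocument("ex1_before.py"), LoadDocument("ex1_after.py")),
#         (LoadDocument("ex2_before.py"), LoadDocument("ex2_after.py")),
#         (LoadDocument("prompt_before.py"), empty_doc),
#     ]
#     CompleteAnalogyTactic(structure, sources)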

def ExtendAnalogyTactic(structure, analogy):
    """Tactic to extend an analogy involving source code.

    See AnalogyUtils.md for the basic idea. This method imposes a particular
    ordering on the breadth-first search over nodes. We first connect the
    before-code to its documentation and after-code. Then we follow tokens
    that are adjacent to each other. Finally, we look at facts about words
    being the same as each other. CrossDoc is used along the way to identify
    associated chunks across documents.
    """
    Fix(analogy.ExtendMap, ["/:TransformPair:Before", "/:Documentation:Code"])
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    Fix(analogy.ExtendMap, ["/:Chunk", "/:SameWord"], no_follow=True)
    # With no_top=False, the map would include Documents:...:_IsMember before
    # we start concretizing. But the concretization pass ignores _IsMember
    # entirely, so the contents would never be concretized. In short, we want
    # to deal with the TOP nodes later, when concretizing.
    Fix(analogy.ExtendMap, no_follow=True, no_top=True)

FOLLOW_RELATIONS = [
    "/:Follower:Before", "/:Follower:After", "/:Document",
    "/:TransformPair:After",
]

def CrossDoc(structure, analogy):
    """Tactic to bootstrap an analogy on a new document.

    The idea is that if two tokens have already been marked as corresponding
    between Documents 0 and 2, we may want to use that information to mark two
    other tokens as corresponding between Documents 1 and 3. We would usually
    do this by following facts like 'SameWord', but those facts may not be
    unique (there could be 100 instances of the same token), so CrossDoc only
    follows the facts that are unique. See AnalogyUtils.md for a more complete
    description.
    """
    ts, rt = structure.ts, structure.rt
    scope = ts.scope("/:Rules:TagCrossDoc:MustMap", protect=True)
    no_slip = rt.ts.scope("/:Mapper:NoSlipRules", protect=True)

    rule = SearchRules(rt, "Tag")[0]
    new_partial = {
        "/:Rules:TagCrossDoc:MustMap:MAlphaA": analogy.MAlphaA,
        "/:Rules:TagCrossDoc:MustMap:MAlphaB": analogy.MAlphaB,
    }
    matcher = GetMatcher(rt, rule, new_partial)
    matcher.sync()
    found = defaultdict(set)
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        found[key].add(assigned[scope[":ChunkMapB"]])
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        if len(found[key]) != 1:
            continue
        analogy.ExtendMap(partial={
            no_slip[":A"]: assigned[scope[":ChunkA"]],
            no_slip[":B"]: assigned[scope[":ChunkB"]],
            no_slip[":MA"]: assigned[scope[":ChunkMapA"]],
            no_slip[":MB"]: assigned[scope[":ChunkMapB"]],
            no_slip[":C"]: assigned[scope[":ChunkType"]],
        })
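
# Minimal illustration of the uniqueness filter above, using plain Python
# containers in place of matcher assignments (toy data only): a key is
# followed only when it has exactly one candidate ChunkMapB.
#
#     candidates = defaultdict(set)
#     candidates[("ChunkA", "MapA", "DocA'")] |= {"MapB1", "MapB2"}  # ambiguous
#     candidates[("ChunkC", "MapC", "DocC'")] |= {"MapD"}            # unique
#     followed = {k: v for k, v in candidates.items() if len(v) == 1}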

def AnalyzeCodelets(ts):
    """Adds helper codelets to identify patterns used in the tactics above."""
    mapper = MapperCodelet(ts)
    ts.add_node("/:TransformPair:Before")
    ts.add_node("/:TransformPair:After")
    ts.add_node("/:Documentation:Code")
    ts.add_node("/:Documentation:Docs")
    with ts.scope("/:Rules:SameWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":MB"].map({ts[":B"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AMember"].map({
                ts[":A"]: ts["/:Chunk"],
                ts[":DocA"]: ts["/:Document"],
            })
            ts[":BMember"].map({
                ts[":B"]: ts["/:Chunk"],
                ts[":DocB"]: ts["/:Document"],
            })
            ts[":AreRelated"].map({
                ts[":DocA"]: ts[":Doc1Relation"],
                ts[":DocB"]: ts[":Doc2Relation"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":AreSame"].map({
                exists[":A"]: ts["/:SameWord"],
                exists[":B"]: ts["/:SameWord"],
            })
        # Some of these may be unnecessary, and they seem to slow things down
        # as well. For the transformation ones we don't need relations within
        # a doc.
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":AMember"]: ts["/MAYBE="],
            exists[":BMember"]: ts["/MAYBE="],
            exists[":AreRelated"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":DocA"]: ts["/MAYBE="],
            exists[":DocB"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":Doc1Relation"]: ts["/MAYBE="],
            exists[":Doc2Relation"]: ts["/MAYBE="],
        })
        RegisterRule(ts)
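
    # SolveWord: once A is mapped to a Word and a SameWord fact links A with B,
    # insert the corresponding mapping MB for B.  This drives the final
    # "Solving the prompt" pass in CompleteAnalogyTactic.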
    with ts.scope("/:Rules:SolveWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AreSame"].map({
                ts[":A"]: ts["/:SameWord"],
                ts[":B"]: ts["/:SameWord"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":MB"].map({exists[":B"]: exists[":Word"]})
        RegisterRule(ts)

    """Crossdoc tagger.

    The idea here is to find chunks ChunkA in document A and ChunkB in document
    B which are currently mapped together. We then want to find documents A',
    B' with chunks ChunkA' and ChunkB' such that A and A' are related in the
    same way as B and B', and ChunkA and ChunkA' are related in the same way
    as ChunkB and ChunkB'.

    Furthermore, we don't want there to be any _other_ ChunkA', ChunkB' which
    are related in that way. I don't think it's actually possible to express
    that as a TS rule using the current setup, so we enforce it later in the
    Python tactic (see CrossDoc above).
    """
    mapper = ts.scope("/:Mapper", protect=True)
    with ts.scope("/:Rules:TagCrossDoc"):
        with ts.scope(":MustMap") as exists:
            for x in ("A", "B"):
                ts[f":IsAbstraction{x}"].map({
                    ts[f":MAlpha{x}"]: ts[mapper[":Abstraction"]]
                })
            for x in ("A", "A'", "B", "B'"):
                ts[f":Doc{x}IsDoc"].map({
                    ts[f":Doc{x}"]: ts["/:Document"],
                    ts[f":Chunk{x}"]: ts["/:Chunk"],
                })
            for x in ("A", "B"):
                ts[f":DocMap{x}"].map({
                    ts[f":Doc{x}"]: ts[":DocType"],
                    ts[f":Doc{x}'"]: ts[":Doc'Type"],
                })
                ts[f":ChunkMap{x}"].map({
                    ts[f":Chunk{x}"]: ts[":ChunkType"],
                    ts[f":Chunk{x}'"]: ts[":Chunk'Type"],
                })
                ts[f":MAlpha{x}"].map({
                    ts[f":Doc{x}"]: ts[":AlphaDoc"],
                    ts[f":Doc{x}'"]: ts[":AlphaDoc'"],
                    ts[f":Chunk{x}"]: ts[":AlphaChunk"],
                })
        RegisterRule(ts)
        equivalence_classes = [
            [":ChunkType", ":Chunk'Type"],
            [":DocType", ":Doc'Type"],
            [":DocA", ":DocB", ":DocA'", ":DocB'"],
            [":DocMapA", ":DocMapB"],
            [":DocAIsDoc", ":DocBIsDoc", ":DocA'IsDoc", ":DocB'IsDoc"],
            [":ChunkA", ":ChunkB", ":ChunkA'", ":ChunkB'"],
        ]
        for equivalence_class in equivalence_classes:
            equivalence_class = [exists[name] for name in equivalence_class]
            AssertNodesEqual(ts, equivalence_class, "", equal_type="/MAYBE=")