diff options
Diffstat (limited to 'examples/program_analysis/analyzelib.py')
-rw-r--r-- | examples/program_analysis/analyzelib.py | 244 |
1 file changed, 244 insertions, 0 deletions
"""Helper methods for analyzing program source code."""
from collections import defaultdict
from tactic_utils import ApplyRulesMatching, SearchRules, GetMatcher, Fix
from mapper import MapperCodelet
from lazy_structure import LazyTextDocument, SPECIAL_CHARACTERS
from ts_utils import RegisterRule, AssertNodesEqual
from analogy_utils import Analogy

def LoadDocument(path, extra_chunks=None, extra_special=None):
    """Load a document from the file system into a LazyTextDocument.

    @path: file-system path of the document to read.
    @extra_chunks: list of (start, length) pairs; each selects a run of
        *lines* in the document (the file is sliced after readlines(), so the
        units are lines, not characters) that should be chunked together.
    @extra_special: list of special tokens that should be chunked together.

    Returns a LazyTextDocument over the full file text, chunked using the
    extra chunks, the extra special tokens, and the global SPECIAL_CHARACTERS.
    """
    extra_special = extra_special or []
    with open(path, "r") as in_file:
        text = in_file.readlines()
    chunks = []
    for start, length in (extra_chunks or []):
        # Join the selected lines into a single chunk; strip leading spaces so
        # the chunk text matches independently of its indentation in the file.
        chunks.append("".join(text[start:(start + length)]).lstrip(" "))
    text = "".join(text)
    return LazyTextDocument(text, chunks + extra_special + SPECIAL_CHARACTERS)

def CompleteAnalogyTactic(structure, sources):
    """Heuristic for completing an analogy involving code transformations.

    @sources should be a list of before/after LazyTextDocuments.
    @sources[0][0] and @sources[1][0] should be the before-code of
    @sources[0][1] and @sources[1][1] respectively.
    @sources[2][0] should be the prompt before code, and @sources[2][1] should
    be a currently-empty LazyTextDocument to be generated by Sifter.

    The tactic proceeds in three phases: tag identical words across documents,
    map the worked example pair onto each other, then map the prompt onto the
    first example and concretize the answer document.
    """
    # Make sure all pending structure edits are visible before rule matching.
    structure.ts.commit()
    print("Identifying word pairs...")
    ApplyRulesMatching(structure.rt, "SameWord", dict())

    print("Mapping the examples...")
    # Seed the analogy at the first chunk of each example's before-document;
    # the NoSlipRules bindings pin A/B/C so the mapper cannot slip them.
    analogy = Analogy.Begin(structure.rt, dict({
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[1][0], sources[1][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }), extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Mapping the prompt...")
    # Same seeding, but against the prompt's before-document. exists=True
    # presumably reuses the already-built abstraction -- confirm against
    # Analogy.Begin's contract.
    analogy = Analogy.Begin(structure.rt, dict({
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[2][0], sources[2][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }), exists=True, extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Solving the prompt...")
    # Switch the analogy into concretization mode, then repeatedly extend the
    # map to fixpoint (Fix re-applies until no progress). The ordering --
    # documents first, then everything, then no-follow -- mirrors
    # ExtendAnalogyTactic's breadth-first discipline.
    analogy.state = "concretize"
    Fix(analogy.ExtendMap, ["/:Document"])
    Fix(analogy.ExtendMap)
    Fix(analogy.ExtendMap, no_follow=True)
    Fix(ApplyRulesMatching, structure.rt, "SolveWord", dict())

def ExtendAnalogyTactic(structure, analogy):
    """Tactic to extend an analogy involving source code.

    See AnalogyUtils.md for the basic idea. This method describes a certain
    ordering of nodes in the breadth-first-search. We first connect the
    before-code to its documentation and after-code. Then we follow tokens that
    are adjacent to each other. Finally, we look at any facts about words being
    the same as each other. CrossDoc is used to identify associated chunks
    across documents.
    """
    Fix(analogy.ExtendMap, ["/:TransformPair:Before", "/:Documentation:Code"])
    # Alternate between following adjacency edges and cross-document
    # bootstrapping; each CrossDoc pass can unlock new followable tokens.
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    Fix(analogy.ExtendMap, ["/:Chunk", "/:SameWord"], no_follow=True)
    # If we have no_top=False, then it will map Documents:...:_IsMember before
    # we start concretizing. But then the concretization pass will completely
    # ignore _IsMember, so it will never concretize the contents. Basically, we
    # just want to deal with the TOP nodes later, when concretizing.
    Fix(analogy.ExtendMap, no_follow=True, no_top=True)

# Relations worth traversing when extending an analogy.
# NOTE(review): not referenced anywhere in this file -- presumably consumed by
# a caller elsewhere; confirm before removing.
FOLLOW_RELATIONS = [
    "/:Follower:Before", "/:Follower:After", "/:Document",
    "/:TransformPair:After",
]

def CrossDoc(structure, analogy):
    """Tactic to bootstrap an analogy on a new document.

    Basically, the idea here is if you've already marked two tokens as
    corresponding between Documents 0 and 2, we may want to use that
    information to mark two different tokens as corresponding in Document 1 and
    3. We would usually want to do this by following facts like 'SameWord.' But
    these facts may not be unique (there could be 100 instances of the same
    token), so CrossDoc looks to only follow the facts that are unique. See
    AnalogyUtils.md for a more complete description.
    """
    ts, rt = structure.ts, structure.rt
    scope = ts.scope("/:Rules:TagCrossDoc:MustMap", protect=True)
    no_slip = rt.ts.scope("/:Mapper:NoSlipRules", protect=True)

    rule = SearchRules(rt, "Tag")[0]
    # Pin the rule's MAlphaA/MAlphaB slots to this analogy's abstraction maps
    # so the matcher only proposes correspondences relevant to this analogy.
    new_partial = dict({
        "/:Rules:TagCrossDoc:MustMap:MAlphaA": analogy.MAlphaA,
        "/:Rules:TagCrossDoc:MustMap:MAlphaB": analogy.MAlphaB,
    })
    matcher = GetMatcher(rt, rule, new_partial)
    matcher.sync()
    # First pass: count, for each (ChunkA, ChunkMapA, DocA') key, the set of
    # candidate ChunkMapB values it could pair with.
    found = defaultdict(set)
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        found[key].add(assigned[scope[":ChunkMapB"]])
    # Second pass: only act on keys with exactly one candidate -- the
    # uniqueness condition the TS rule itself cannot express (see the
    # TagCrossDoc rule in AnalyzeCodelets).
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        if len(found[key]) != 1:
            continue
        analogy.ExtendMap(partial=dict({
            no_slip[":A"]: assigned[scope[":ChunkA"]],
            no_slip[":B"]: assigned[scope[":ChunkB"]],
            no_slip[":MA"]: assigned[scope[":ChunkMapA"]],
            no_slip[":MB"]: assigned[scope[":ChunkMapB"]],
            no_slip[":C"]: assigned[scope[":ChunkType"]],
        }))

def AnalyzeCodelets(ts):
    """Adds helper codelets to identify patterns used in the tactics above."""
    mapper = MapperCodelet(ts)
    ts.add_node("/:TransformPair:Before")
    ts.add_node("/:TransformPair:After")
    ts.add_node("/:Documentation:Code")
    ts.add_node("/:Documentation:Docs")
    # SameWord rule: if chunk A in DocA and chunk B in DocB are instances of
    # the same word, and the two documents are related, tag the pair with a
    # /:SameWord fact (inserted via the NoMap:Insert scope).
    with ts.scope("/:Rules:SameWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":MB"].map({ts[":B"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AMember"].map({
                ts[":A"]: ts["/:Chunk"],
                ts[":DocA"]: ts["/:Document"],
            })
            ts[":BMember"].map({
                ts[":B"]: ts["/:Chunk"],
                ts[":DocB"]: ts["/:Document"],
            })
            ts[":AreRelated"].map({
                ts[":DocA"]: ts[":Doc1Relation"],
                ts[":DocB"]: ts[":Doc2Relation"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":AreSame"].map({
                exists[":A"]: ts["/:SameWord"],
                exists[":B"]: ts["/:SameWord"],
            })
        # Some of these may be unnecessary, they seem to slow things down as
        # well. For the transformation ones we don't need relations within a
        # doc.
        # NOTE(review): /MAYBE= presumably marks these bindings as optional
        # for the matcher -- confirm against the TS rule semantics.
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":AMember"]: ts["/MAYBE="],
            exists[":BMember"]: ts["/MAYBE="],
            exists[":AreRelated"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":DocA"]: ts["/MAYBE="],
            exists[":DocB"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":Doc1Relation"]: ts["/MAYBE="],
            exists[":Doc2Relation"]: ts["/MAYBE="],
        })
        RegisterRule(ts)
    # SolveWord rule: once A and B carry a /:SameWord fact and A is mapped to
    # its word, insert the corresponding word map MB for B.
    with ts.scope("/:Rules:SolveWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AreSame"].map({
                ts[":A"]: ts["/:SameWord"],
                ts[":B"]: ts["/:SameWord"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":MB"].map({exists[":B"]: exists[":Word"]})
        RegisterRule(ts)

    # Crossdoc tagger.
    #
    # The idea here is to find chunks ChunkA in document A and ChunkB in
    # document B which are currently mapped together. We then want to find
    # documents A', B' with chunks ChunkA' and ChunkB' such that A and A' are
    # related in the same way as B and B', and ChunkA and ChunkA' are related
    # in the same ways as ChunkB and ChunkB'.
    #
    # Furthermore, we don't want there to be any _other_ ChunkA', ChunkB'
    # which are related in that way. I don't think it's actually possible to
    # express that as a TS rule using the current set up, so we'll enforce
    # that later in the Python tactic (see CrossDoc above).
    mapper = ts.scope("/:Mapper", protect=True)
    with ts.scope("/:Rules:TagCrossDoc"):
        with ts.scope(":MustMap") as exists:
            # MAlphaA/MAlphaB must be abstraction maps (bound by CrossDoc to
            # the analogy's maps via the rule's partial assignment).
            for x in ("A", "B"):
                ts[f":IsAbstraction{x}"].map({
                    ts[f":MAlpha{x}"]: ts[mapper[":Abstraction"]]
                })
            # All four documents hold chunks.
            for x in ("A", "A'", "B", "B'"):
                ts[f":Doc{x}IsDoc"].map({
                    ts[f":Doc{x}"]: ts["/:Document"],
                    ts[f":Chunk{x}"]: ts["/:Chunk"],
                })
            # Doc/chunk relations on the A side must mirror the B side, and
            # the abstraction map must cover doc, primed doc, and chunk.
            for x in ("A", "B"):
                ts[f":DocMap{x}"].map({
                    ts[f":Doc{x}"]: ts[":DocType"],
                    ts[f":Doc{x}'"]: ts[":Doc'Type"],
                })
                ts[f":ChunkMap{x}"].map({
                    ts[f":Chunk{x}"]: ts[":ChunkType"],
                    ts[f":Chunk{x}'"]: ts[":Chunk'Type"],
                })
                ts[f":MAlpha{x}"].map({
                    ts[f":Doc{x}"]: ts[":AlphaDoc"],
                    ts[f":Doc{x}'"]: ts[":AlphaDoc'"],
                    ts[f":Chunk{x}"]: ts[":AlphaChunk"],
                })
        RegisterRule(ts)
        # Allow (but do not require) the matcher to unify nodes within each
        # class, e.g. DocA may coincide with DocB.
        equivalence_classes = [
            [":ChunkType", ":Chunk'Type"],
            [":DocType", ":Doc'Type"],
            [":DocA", ":DocB", ":DocA'", ":DocB'"],
            [":DocMapA", ":DocMapB"],
            [":DocAIsDoc", ":DocBIsDoc", ":DocA'IsDoc", ":DocB'IsDoc"],
            [":ChunkA", ":ChunkB", ":ChunkA'", ":ChunkB'"],
        ]
        for equivalence_class in equivalence_classes:
            equivalence_class = [exists[name] for name in equivalence_class]
            AssertNodesEqual(ts, equivalence_class, "", equal_type="/MAYBE=")