diff options
Diffstat (limited to 'examples/program_analysis/analyzelib.py')
-rw-r--r-- | examples/program_analysis/analyzelib.py | 244 |
1 file changed, 244 insertions, 0 deletions
"""Helper methods for analyzing program source code."""
from collections import defaultdict
from tactic_utils import ApplyRulesMatching, SearchRules, GetMatcher, Fix
from mapper import MapperCodelet
from lazy_structure import LazyTextDocument, SPECIAL_CHARACTERS
from ts_utils import RegisterRule, AssertNodesEqual
from analogy_utils import Analogy

def LoadDocument(path, extra_chunks=None, extra_special=None):
    """Load a document from the file system into a LazyTextDocument.

    @path: file-system path of the document to read.
    @extra_chunks: list of (start, length) pairs; each selects a run of
        *lines* in the document (the file is sliced after readlines(), so the
        units are lines, not characters) that should be chunked together.
    @extra_special: list of special tokens that should be chunked together.

    Returns a LazyTextDocument over the full file text, chunked using the
    extra chunks, the extra special tokens, and the global SPECIAL_CHARACTERS.
    """
    extra_special = extra_special or []
    with open(path, "r") as in_file:
        text = in_file.readlines()
    chunks = []
    for start, length in (extra_chunks or []):
        # Join the selected lines into a single chunk; strip leading spaces so
        # the chunk text matches independently of its indentation in the file.
        chunks.append("".join(text[start:(start + length)]).lstrip(" "))
    text = "".join(text)
    return LazyTextDocument(text, chunks + extra_special + SPECIAL_CHARACTERS)

def CompleteAnalogyTactic(structure, sources):
    """Heuristic for completing an analogy involving code transformations.

    @sources should be a list of before/after LazyTextDocuments.
    @sources[0][0] and @sources[1][0] should be the before-code of
    @sources[0][1] and @sources[1][1] respectively.
    @sources[2][0] should be the prompt before code, and @sources[2][1] should
    be a currently-empty LazyTextDocument to be generated by Sifter.

    The tactic proceeds in three phases: tag identical words across documents,
    map the worked example pair onto each other, then map the prompt onto the
    first example and concretize the answer document.
    """
    # Make sure all pending structure edits are visible before rule matching.
    structure.ts.commit()
    print("Identifying word pairs...")
    ApplyRulesMatching(structure.rt, "SameWord", dict())

    print("Mapping the examples...")
    # Seed the analogy at the first chunk of each example's before-document;
    # the NoSlipRules bindings pin A/B/C so the mapper cannot slip them.
    analogy = Analogy.Begin(structure.rt, dict({
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[1][0], sources[1][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }), extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Mapping the prompt...")
    # Same seeding, but against the prompt's before-document. exists=True
    # presumably reuses the already-built abstraction -- confirm against
    # Analogy.Begin's contract.
    analogy = Analogy.Begin(structure.rt, dict({
        "/:Mapper:NoSlipRules:A":
            structure.NodeOfChunk(sources[0][0], sources[0][0].chunks[0]),
        "/:Mapper:NoSlipRules:B":
            structure.NodeOfChunk(sources[2][0], sources[2][0].chunks[0]),
        "/:Mapper:NoSlipRules:C": "/:Chunk",
    }), exists=True, extend_here=["/:Document"])
    ExtendAnalogyTactic(structure, analogy)

    print("Solving the prompt...")
    # Switch the analogy into concretization mode, then repeatedly extend the
    # map to fixpoint (Fix re-applies until no progress). The ordering --
    # documents first, then everything, then no-follow -- mirrors
    # ExtendAnalogyTactic's breadth-first discipline.
    analogy.state = "concretize"
    Fix(analogy.ExtendMap, ["/:Document"])
    Fix(analogy.ExtendMap)
    Fix(analogy.ExtendMap, no_follow=True)
    Fix(ApplyRulesMatching, structure.rt, "SolveWord", dict())

def ExtendAnalogyTactic(structure, analogy):
    """Tactic to extend an analogy involving source code.

    See AnalogyUtils.md for the basic idea. This method describes a certain
    ordering of nodes in the breadth-first-search. We first connect the
    before-code to its documentation and after-code. Then we follow tokens that
    are adjacent to each other. Finally, we look at any facts about words being
    the same as each other. CrossDoc is used to identify associated chunks
    across documents.
    """
    Fix(analogy.ExtendMap, ["/:TransformPair:Before", "/:Documentation:Code"])
    # Alternate between following adjacency edges and cross-document
    # bootstrapping; each CrossDoc pass can unlock new followable tokens.
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    CrossDoc(structure, analogy)
    Fix(analogy.ExtendMap, ["/:Follower:Before", "/:Follower:After"])
    Fix(analogy.ExtendMap, ["/:Chunk", "/:SameWord"], no_follow=True)
    # If we have no_top=False, then it will map Documents:...:_IsMember before
    # we start concretizing. But then the concretization pass will completely
    # ignore _IsMember, so it will never concretize the contents. Basically, we
    # just want to deal with the TOP nodes later, when concretizing.
    Fix(analogy.ExtendMap, no_follow=True, no_top=True)

# Relations worth traversing when extending an analogy.
# NOTE(review): not referenced anywhere in this file -- presumably consumed by
# a caller elsewhere; confirm before removing.
FOLLOW_RELATIONS = [
    "/:Follower:Before", "/:Follower:After", "/:Document",
    "/:TransformPair:After",
]

def CrossDoc(structure, analogy):
    """Tactic to bootstrap an analogy on a new document.

    Basically, the idea here is if you've already marked two tokens as
    corresponding between Documents 0 and 2, we may want to use that
    information to mark two different tokens as corresponding in Document 1 and
    3. We would usually want to do this by following facts like 'SameWord.' But
    these facts may not be unique (there could be 100 instances of the same
    token), so CrossDoc looks to only follow the facts that are unique. See
    AnalogyUtils.md for a more complete description.
    """
    ts, rt = structure.ts, structure.rt
    scope = ts.scope("/:Rules:TagCrossDoc:MustMap", protect=True)
    no_slip = rt.ts.scope("/:Mapper:NoSlipRules", protect=True)

    rule = SearchRules(rt, "Tag")[0]
    # Pin the rule's MAlphaA/MAlphaB slots to this analogy's abstraction maps
    # so the matcher only proposes correspondences relevant to this analogy.
    new_partial = dict({
        "/:Rules:TagCrossDoc:MustMap:MAlphaA": analogy.MAlphaA,
        "/:Rules:TagCrossDoc:MustMap:MAlphaB": analogy.MAlphaB,
    })
    matcher = GetMatcher(rt, rule, new_partial)
    matcher.sync()
    # First pass: count, for each (ChunkA, ChunkMapA, DocA') key, the set of
    # candidate ChunkMapB values it could pair with.
    found = defaultdict(set)
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        found[key].add(assigned[scope[":ChunkMapB"]])
    # Second pass: only act on keys with exactly one candidate -- the
    # uniqueness condition the TS rule itself cannot express (see the
    # TagCrossDoc rule in AnalyzeCodelets).
    for assign in matcher.assignments():
        assigned = assign.assignment
        key = (assigned[scope[":ChunkA"]], assigned[scope[":ChunkMapA"]], assigned[scope[":DocA'"]])
        if len(found[key]) != 1:
            continue
        analogy.ExtendMap(partial=dict({
            no_slip[":A"]: assigned[scope[":ChunkA"]],
            no_slip[":B"]: assigned[scope[":ChunkB"]],
            no_slip[":MA"]: assigned[scope[":ChunkMapA"]],
            no_slip[":MB"]: assigned[scope[":ChunkMapB"]],
            no_slip[":C"]: assigned[scope[":ChunkType"]],
        }))

def AnalyzeCodelets(ts):
    """Adds helper codelets to identify patterns used in the tactics above."""
    mapper = MapperCodelet(ts)
    ts.add_node("/:TransformPair:Before")
    ts.add_node("/:TransformPair:After")
    ts.add_node("/:Documentation:Code")
    ts.add_node("/:Documentation:Docs")
    # SameWord rule: if chunk A in DocA and chunk B in DocB are instances of
    # the same word, and the two documents are related, tag the pair with a
    # /:SameWord fact (inserted via the NoMap:Insert scope).
    with ts.scope("/:Rules:SameWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":MB"].map({ts[":B"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AMember"].map({
                ts[":A"]: ts["/:Chunk"],
                ts[":DocA"]: ts["/:Document"],
            })
            ts[":BMember"].map({
                ts[":B"]: ts["/:Chunk"],
                ts[":DocB"]: ts["/:Document"],
            })
            ts[":AreRelated"].map({
                ts[":DocA"]: ts[":Doc1Relation"],
                ts[":DocB"]: ts[":Doc2Relation"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":AreSame"].map({
                exists[":A"]: ts["/:SameWord"],
                exists[":B"]: ts["/:SameWord"],
            })
        # Some of these may be unnecessary, they seem to slow things down as
        # well. For the transformation ones we don't need relations within a
        # doc.
        # NOTE(review): /MAYBE= presumably marks these bindings as optional
        # for the matcher -- confirm against the TS rule semantics.
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":AMember"]: ts["/MAYBE="],
            exists[":BMember"]: ts["/MAYBE="],
            exists[":AreRelated"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":DocA"]: ts["/MAYBE="],
            exists[":DocB"]: ts["/MAYBE="],
        })
        ts[":??"].map({
            ts[":_"]: ts["/RULE"],
            exists[":Doc1Relation"]: ts["/MAYBE="],
            exists[":Doc2Relation"]: ts["/MAYBE="],
        })
        RegisterRule(ts)
    # SolveWord rule: once A and B carry a /:SameWord fact and A is mapped to
    # its word, insert the corresponding word map MB for B.
    with ts.scope("/:Rules:SolveWord"):
        with ts.scope(":MustMap") as exists:
            ts[":MA"].map({ts[":A"]: ts[":Word"]})
            ts[":IsWord"].map({ts[":Word"]: ts["/:Word"]})
            ts[":AreSame"].map({
                ts[":A"]: ts["/:SameWord"],
                ts[":B"]: ts["/:SameWord"],
            })
        with ts.scope(":NoMap:Insert"):
            ts[":MB"].map({exists[":B"]: exists[":Word"]})
        RegisterRule(ts)

    # Crossdoc tagger.
    #
    # The idea here is to find chunks ChunkA in document A and ChunkB in
    # document B which are currently mapped together. We then want to find
    # documents A', B' with chunks ChunkA' and ChunkB' such that A and A' are
    # related in the same way as B and B', and ChunkA and ChunkA' are related
    # in the same ways as ChunkB and ChunkB'.
    #
    # Furthermore, we don't want there to be any _other_ ChunkA', ChunkB'
    # which are related in that way. I don't think it's actually possible to
    # express that as a TS rule using the current set up, so we'll enforce
    # that later in the Python tactic (see CrossDoc above).
    mapper = ts.scope("/:Mapper", protect=True)
    with ts.scope("/:Rules:TagCrossDoc"):
        with ts.scope(":MustMap") as exists:
            # MAlphaA/MAlphaB must be abstraction maps (bound by CrossDoc to
            # the analogy's maps via the rule's partial assignment).
            for x in ("A", "B"):
                ts[f":IsAbstraction{x}"].map({
                    ts[f":MAlpha{x}"]: ts[mapper[":Abstraction"]]
                })
            # All four documents hold chunks.
            for x in ("A", "A'", "B", "B'"):
                ts[f":Doc{x}IsDoc"].map({
                    ts[f":Doc{x}"]: ts["/:Document"],
                    ts[f":Chunk{x}"]: ts["/:Chunk"],
                })
            # Doc/chunk relations on the A side must mirror the B side, and
            # the abstraction map must cover doc, primed doc, and chunk.
            for x in ("A", "B"):
                ts[f":DocMap{x}"].map({
                    ts[f":Doc{x}"]: ts[":DocType"],
                    ts[f":Doc{x}'"]: ts[":Doc'Type"],
                })
                ts[f":ChunkMap{x}"].map({
                    ts[f":Chunk{x}"]: ts[":ChunkType"],
                    ts[f":Chunk{x}'"]: ts[":Chunk'Type"],
                })
                ts[f":MAlpha{x}"].map({
                    ts[f":Doc{x}"]: ts[":AlphaDoc"],
                    ts[f":Doc{x}'"]: ts[":AlphaDoc'"],
                    ts[f":Chunk{x}"]: ts[":AlphaChunk"],
                })
        RegisterRule(ts)
        # Allow (but do not require) the matcher to unify nodes within each
        # class, e.g. DocA may coincide with DocB.
        equivalence_classes = [
            [":ChunkType", ":Chunk'Type"],
            [":DocType", ":Doc'Type"],
            [":DocA", ":DocB", ":DocA'", ":DocB'"],
            [":DocMapA", ":DocMapB"],
            [":DocAIsDoc", ":DocBIsDoc", ":DocA'IsDoc", ":DocB'IsDoc"],
            [":ChunkA", ":ChunkB", ":ChunkA'", ":ChunkB'"],
        ]
        for equivalence_class in equivalence_classes:
            equivalence_class = [exists[name] for name in equivalence_class]
            AssertNodesEqual(ts, equivalence_class, "", equal_type="/MAYBE=")