From 55efcc61150bbd305bea54434ef895b51c36727d Mon Sep 17 00:00:00 2001 From: Reid 'arrdem' McKenzie Date: Wed, 3 May 2023 22:46:26 -0600 Subject: [PATCH] Sketching towards a prototype lisp --- components/hydra/BUILD | 5 + components/hydra/src/python/hydra/__main__.py | 9 + .../hydra/src/python/hydra/interpreter.py | 54 ++++++ components/hydra/src/python/hydra/models.py | 52 ++++++ components/hydra/src/python/hydra/syntax.py | 154 ++++++++++++++++++ components/milkshake/BUILD | 6 + .../src/python/milkshake/__init__.py | 45 +++++ .../src/python/milkshake/grammar.lark | 77 +++++++++ .../milkshake/test/python/test_examples.py | 101 ++++++++++++ components/typhon/BUILD | 7 + components/typhon/notes.md | 51 ++++++ .../typhon/src/python/typhon/__init__.py | 147 +++++++++++++++++ components/typhon/test/python/__init__.py | 1 + components/typhon/test/python/test.ty | 20 +++ 14 files changed, 729 insertions(+) create mode 100644 components/hydra/src/python/hydra/interpreter.py create mode 100644 components/hydra/src/python/hydra/models.py create mode 100644 components/hydra/src/python/hydra/syntax.py create mode 100644 components/milkshake/BUILD create mode 100644 components/milkshake/src/python/milkshake/__init__.py create mode 100644 components/milkshake/src/python/milkshake/grammar.lark create mode 100644 components/milkshake/test/python/test_examples.py create mode 100644 components/typhon/BUILD create mode 100644 components/typhon/notes.md create mode 100644 components/typhon/src/python/typhon/__init__.py create mode 100644 components/typhon/test/python/__init__.py create mode 100644 components/typhon/test/python/test.ty diff --git a/components/hydra/BUILD b/components/hydra/BUILD index 286d36b..d32713c 100644 --- a/components/hydra/BUILD +++ b/components/hydra/BUILD @@ -1,3 +1,8 @@ py_project( name = "hydra", + lib_deps = [ + py_requirement("attrs"), + py_requirement("click"), + ], + main = "src/python/hydra/__main__.py", ) diff --git 
# ===== components/hydra/src/python/hydra/__main__.py =====
#!/usr/bin/env python3.10

from pathlib import Path

import click

from hydra.interpreter import run_script


@click.group()
def cli():
    pass


@cli.command()
@click.argument("script", type=Path)
def run(script):
    """Run SCRIPT under the Hydra interpreter."""
    run_script(script)


if __name__ == "__main__":
    cli.main(prog_name="hydra")


# ===== components/hydra/src/python/hydra/interpreter.py =====

"""The Hydra interpreter.

Hydra is an odd implementation of Python interpretation. Control is modeled in
terms of continuations, each continuation having a classical stack and a control
pointer. However rather than converting Python ASTs to a bytecode format
suitable for sequential indexing as in the CPython VM, Hydra simply maintains a
"cursor" into the AST. This allows for simultaneous implementation of direct AST
interpretation (for simplicity) and continuations.
"""

import ast
from pathlib import Path
from typing import Dict, List, Optional

from hydra.models import Continuation, Cursor, Module, Namespace, Runtime
from hydra.syntax import validate_mod


def run_script(
    script: Path,
    pypath: Optional[List[Path]] = None,
    pyenv: Optional[Dict[str, str]] = None,
):
    """Given a script, start a thread and run it to 'completion'.

    Mostly a stub designed to get me rolling.

    :param script: path of the Python source file to interpret.
    :param pypath: module search path (accepted but unused so far).
    :param pyenv: environment bindings (accepted but unused so far).
    """
    pypath = pypath or []
    pyenv = pyenv or {}

    with open(script, "r", encoding="utf-8") as fp:
        mod_ast: ast.Module = ast.parse(fp.read())

    # Locate unsupported syntax, if any
    validate_mod(mod_ast)

    # Create the module object
    mod = Module("__main__", mod_ast)

    # Create the main continuation
    main = Continuation(entrypoint=Cursor(module="__main__", path=[0]), stack=[])

    # Create the runtime
    rt = Runtime(modules=Namespace({"__main__": mod}))

    # Register the main continuation
    rt.conts[rt.main] = main

    for stmt in mod_ast.body:
        print(stmt)


# ===== components/hydra/src/python/hydra/models.py =====

#!/usr/bin/env python3

import ast
from pathlib import Path
from typing import Any, Dict, List, Optional
from uuid import UUID, uuid4

from attrs import define, field


@define
class Cursor:
    # Name of the module this cursor points into.
    module: str
    # Index path into the module's AST (cf. interpreter.run_script, which
    # seeds the main continuation with path=[0]).
    path: List[int]


@define
class Continuation:
    entrypoint: Cursor
    stack: List[Cursor]
    parent: Optional["Continuation"] = None


@define
class Namespace:
    """Namespaces name ... values.

    Modules, classes and other forms of scopes are all Namespaces.
    """

    # BUG FIX: attrs does not invoke bare callables passed via `default=`;
    # `field(default=dict)` made the default *value* the `dict` class itself,
    # shared by every instance. `factory=` constructs a fresh dict per instance.
    d: Dict[str, Any] = field(factory=dict)


@define
class Module:
    """A 'module' maps names to code objects such as constants, classes and functions (also classes)."""

    name: str
    ast: ast.Module
    # BUG FIX: was `field(default=dict)` — see Namespace.d.
    d: Dict[str, Any] = field(factory=dict)


@define
class Runtime:
    """A Runtime represents a loaded and potentially runnable program."""

    # BUG FIX (all fields below): `default=<callable>` stored the callable
    # itself (e.g. `rt.main` was the uuid4 *function*, not a UUID). `factory=`
    # calls it once per instance, which is what the interpreter relies on when
    # it does `rt.conts[rt.main] = main`.
    env: Dict[str, str] = field(factory=dict)
    path: List[Path] = field(factory=list)
    main: UUID = field(factory=uuid4)
    conts: Dict[UUID, Continuation] = field(factory=dict)
    modules: Namespace = field(factory=Namespace)


# ===== components/hydra/src/python/hydra/syntax.py (head; continues below) =====

#!/usr/bin/env python3

"""The syntax checker.

The Python AST module will gladly parse a variety of meaningless statements
such as a top-level `pass`. These are, for Hydra's purposes, illegal and
cannot be evaluated.

Furthermore, Hydra doesn't support (currently a lot) of Python's syntax. Match
and async for instance are right out.

The syntax module exists to provide "validation" that a given AST tree is
"legitimate".
"""


import ast
from typing import Union

# 'stmt' from the AST def. (3.11)
Statement = Union[
    ast.FunctionDef,
    ast.AsyncFunctionDef,
    ast.ClassDef,
    ast.Return,
    ast.Delete,
    ast.Assign,
    ast.AugAssign,
    ast.AnnAssign,
    ast.For,
    ast.AsyncFor,
    ast.While,
    ast.If,
    ast.With,
    ast.AsyncWith,
    ast.Match,
    ast.Raise,
    ast.Try,
    ast.Assert,
    ast.Import,
    ast.ImportFrom,
    ast.Global,
    ast.Nonlocal,
    ast.Expr,
    ast.Pass,
    ast.Break,
    ast.Continue,
]

# 'expr' from the AST def.
(3.11) +Expression = Union[ + ast.BoolOp, + ast.NamedExpr, + ast.BinOp, + ast.UnaryOp, + ast.Lambda, + ast.IfExp, + ast.Dict, + ast.Set, + ast.ListComp, + ast.SetComp, + ast.DictComp, + ast.GeneratorExp, + ast.Await, + ast.Yield, + ast.YieldFrom, + ast.Compare, + ast.Call, + ast.FormattedValue, + ast.JoinedStr, + ast.Constant, + ast.Attribute, + ast.Subscript, + ast.Starred, + ast.Name, + ast.List, + ast.Tuple, + ast.Slice, +] + + +def collect(ctor): + """Coerce a generator style function into returning a collection.""" + + def decorator(f): + def helper(*args, **kwargs): + return ctor(f(*args, **kwargs)) + + return helper + + return decorator + + +@collect(list) +def validate_stmt(stmt: Statement, top=False): + if isinstance(stmt, (ast.AsyncFunctionDef, ast.AsyncFor, ast.AsyncWith, ast.Match)): + # Stuff we don't support anywhere + yield stmt + + elif top and isinstance( + stmt, + ( + ast.AsyncFor, + ast.AsyncWith, + ast.Break, + ast.Continue, + ast.Nonlocal, + ast.Pass, + ast.Return, + ), + ): + # Statements that are specifically meaningless at the top level + yield stmt + + elif isinstance(stmt, (ast.FunctionDef, ast.ClassDef)): + # Recur into function/class definitions + for it in stmt.body: + yield from validate_stmt(it) + + elif isinstance(stmt, ast.If): + for it in stmt.body: + yield from validate_stmt(it) + + for it in stmt.orelse or []: + yield from validate_stmt(it) + + elif isinstance(stmt, ast.Try): + for it in stmt.body: + yield from validate_stmt(it) + + for handler in stmt.handlers or []: + for it in handler.body: + yield from validate_stmt(it) + + for it in stmt.orelse or []: + yield from validate_stmt(it) + + for it in stmt.finalbody or []: + yield from validate_stmt(it) + + elif isinstance(stmt, (ast.With, ast.While, ast.For)): + for it in stmt.body: + yield from validate_stmt(it) + + elif isinstance(stmt, ast.Expression): + yield from validate_expr(stmt) + + +def validate_mod(mod: ast.Module): + problems = [] + for stmt in mod.body: + if ps := 
# ===== components/milkshake/BUILD (Starlark, reproduced as a comment) =====
# py_project(
#     name = "milkshake",
#     lib_deps = [
#         py_requirement("lark"),
#     ],
# )

# ===== components/milkshake/src/python/milkshake/__init__.py =====

"""A quick and dirty Lisp reader"""


from importlib.resources import files

from lark import Lark, Tree, Token, Transformer, v_args


# Load the packaged grammar once at import time.
with files(__package__).joinpath("grammar.lark").open("r", encoding="utf-8") as fp:
    GRAMMAR = fp.read()


@v_args(tree=True)
class T(Transformer):
    """A prepackaged transformer that cleans up the quoting details."""

    def un_qq(self, obj):
        # Strip the "qq_" prefix off a quasiquoted rule name so e.g. a
        # `qq_list` tree becomes a plain `list` tree, preserving the token type.
        return Tree(
            Token(obj.data.type, obj.data.value.replace("qq_", "", 1)), obj.children
        )

    qq_list = un_qq
    qq_tuple = un_qq
    # BUG FIX: was misspelled `qq_vecotor`, so quasiquoted vectors were never
    # renamed back to plain `vector` trees (the grammar rule is `qq_vector`).
    qq_vector = un_qq
    qq_map = un_qq
    qq_set = un_qq
    qq_atom = un_qq
    qq_symbol = un_qq

    def qq_quote(self, obj):
        return self.quote(self.un_qq(obj))

    def quote(self, obj):
        # Drop the quote-marker token, keeping only the quoted form(s).
        return Tree(obj.data, obj.children[1:])

    quasiquote = quote
    unquote = quote
    unquote_splicing = quote


PARSER = Lark(GRAMMAR, start=["module", "expr"])


def slurp(text: str, start="module") -> Tree:
    """Parse `text` with the milkshake grammar and normalize quoting nodes."""
    return T().transform(PARSER.parse(text, start=start))


# ===== components/milkshake/src/python/milkshake/grammar.lark (head; continues) =====
# start: module
# module: expr+
#
# ?expr: quote
#      | quasiquote
#      | list
#      | tuple
#      | vector
#      | set
#      | map
#      | atom
#
# quote: QUOTE expr
#
# quasiquote: QUASIQUOTE qq_expr
# unquote:
UNQUOTE qq_expr +unquote_splicing: UNQUOTE_SPLICING qq_expr +qq_quote: QUOTE qq_expr + +?qq_expr: qq_quote + | unquote + | unquote_splicing + | qq_list + | qq_tuple + | qq_vector + | qq_set + | qq_map + | qq_atom + +list: "(" expr* ")" +qq_list: "(" qq_expr* ")" + +tuple: "[" expr "]" +qq_tuple: "[" qq_expr "]" + +vector: "#[" expr "]" +qq_vector: "#[" qq_expr "]" + +map: "{" (expr expr)* "}" +qq_map: "{" (qq_expr qq_expr)* "}" + +set: "#{" expr* "}" +qq_set: "#{" qq_expr* "}" + +?atom: string + | pattern + | number + | symbol + +?qq_atom: string + | pattern + | number + | qq_symbol + +string: /"([^"]|\\")+"/ + +pattern: /\/([^\/]|\\\/)+\// + +number: /[+-]?(\d+r)?(\d[\d,_\.]*)([\.,][\d,_\.]*)?(e[+-]?\d+)?/ + +// Note that we're demoting Symbol from the default parse priority of 0 to -1 +// This because _anything more specific_ should be chosen over symbol +symbol.-1: /[^\d\s'"`()\[\]:]+/ + +// Note that qq symbols explicitly forbid leading ~, unlike normal symbols +qq_symbol.-1: /[^\d\s'"`()\[\]:~]+/ + +QUOTE: /'/ +QUASIQUOTE: /`/ +UNQUOTE: /~/ +UNQUOTE_SPLICING: /~@/ + +COMMENT: /;.*?\n/ +WHITESPACE: /\s+/ + +%ignore COMMENT +%ignore WHITESPACE diff --git a/components/milkshake/test/python/test_examples.py b/components/milkshake/test/python/test_examples.py new file mode 100644 index 0000000..94a54ba --- /dev/null +++ b/components/milkshake/test/python/test_examples.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +from milkshake import slurp + +from lark import Tree, Token +import pytest + + +@pytest.mark.parametrize( + "input, val", + [ + ("()", Tree("list", [])), + ("nil", nil := Tree("symbol", ["nil"])), + ("(nil nil nil)", Tree("list", [nil, nil, nil])), + ( + "(/ + - * % ^ \\ & # @ ! 
= |)", + Tree( + "list", + [ + Tree("symbol", ["/"]), + Tree("symbol", ["+"]), + Tree("symbol", ["-"]), + Tree("symbol", ["*"]), + Tree("symbol", ["%"]), + Tree("symbol", ["^"]), + Tree("symbol", ["\\"]), + Tree("symbol", ["&"]), + Tree("symbol", ["#"]), + Tree("symbol", ["@"]), + Tree("symbol", ["!"]), + Tree("symbol", ["="]), + Tree("symbol", ["|"]), + ], + ), + ), + ("1.0", Tree("number", ["1.0"])), + ("+1.0", Tree("number", ["+1.0"])), + ("-1.0", Tree("number", ["-1.0"])), + ("-10,000,000.0", Tree("number", ["-10,000,000.0"])), + ("-10.000.000,0", Tree("number", ["-10.000.000,0"])), + ("-10_000_000,0", Tree("number", ["-10_000_000,0"])), + ("-10_000_000.0", Tree("number", ["-10_000_000.0"])), + ("1e50", Tree("number", ["1e50"])), + ("-1e-50", Tree("number", ["-1e-50"])), + ("-1e+50", Tree("number", ["-1e+50"])), + ( + "(+inf -inf inf nan +nan -nan)", + Tree( + "list", + [ + Tree("symbol", ["+inf"]), + Tree("symbol", ["-inf"]), + Tree("symbol", ["inf"]), + Tree("symbol", ["nan"]), + Tree("symbol", ["+nan"]), + Tree("symbol", ["-nan"]), + ], + ), + ), + ("'()", Tree("quote", [Tree("list", [])])), + ( + "`(nil ~'() ~@'())", + Tree( + "quasiquote", + [ + Tree( + "list", + [ + Tree("symbol", ["nil"]), + Tree( + "unquote", + [ + Tree( + "quote", + [ + Tree("list", []), + ], + ), + ], + ), + Tree( + "unquote_splicing", + [ + Tree( + "quote", + [ + Tree("list", []), + ], + ), + ], + ), + ], + ), + ], + ), + ), + (r"/foo\S+/", Tree("pattern", [r"/foo\S+/"])), + ], +) +def test_examples(input, val): + assert slurp(input, start="expr") == val diff --git a/components/typhon/BUILD b/components/typhon/BUILD new file mode 100644 index 0000000..7552004 --- /dev/null +++ b/components/typhon/BUILD @@ -0,0 +1,7 @@ +py_project( + name = "typhon", + lib_deps = [ + "//components/milkshake", + py_requirement("attrs"), + ] +) diff --git a/components/typhon/notes.md b/components/typhon/notes.md new file mode 100644 index 0000000..544ae92 --- /dev/null +++ b/components/typhon/notes.md 
@@ -0,0 +1,51 @@ +# Notes + +Natural gradient; interpreter -> semi-VM -> tracing JIT/translator -> optimizing JIT/translator + -> abstract interpreter -> static compiler/translator + +A semi-VM which demand translates AST nodes into a stack of demanded evaluation +terms and then walks the evaluation stack as if it were a bytecode or +semi-bytecode evaluator. The advantage of this strategy is that the demanded +operation / control stack and paired data stack eliminate the need to leverage +the system control stack. This gets you serializable stacks for Flowmetal. But +you write two half interpreters. + +Now the natural question is why write a hosted VM to get serializable stacks +when Python has a perfectly good bytecode VM already? Sticking one VM atop +another is ... a bit silly especially since the goal of doing so is to be able +to "drop down" from the one to the other to ensure compatibility. + +Is there a lens through which the serialization requirements of Flowmetal can +be satisfied from "normal" Python using the "normal" Python bytecode +interpreter? + +Consider - function call points and function return points are in a sense +language safe points. Rather than trying to capture the entire evaluation +"image" at some point, one could instead track the call/return evaluation log +for replay. Such a scheme would allow Flowmetal to be implemented using static +rewrites of Python ASTs. Any function call becomes a checkpoint as does +receiving the return result. + +Any `__call__` invocation needs to be evaluated as something like + + x = runtime.call(const_gen_call_id, instance, args) + +This tactic specifically leans on `yield` being a statement _with a return +value_. This pattern would let the "runtime" as the root evaluation routine +'continue' any given call site with the return result.
`runtime.call` would be +some incantation for producing a sentinel value to the runtime noting that a +function call had been requested - and that its result should either be computed +or replayed from a log. + +There are a lot of opportunities for optimization here. Not every function call +needs its value persisted into the log. Most function calls depend only on the +direct live state of the program. Exceptions are things like interacting with +file descriptors/sockets and clocks. But strictly data-dependent operations like +dictionary mutations are entirely safe under replay. They're only path +dependent. So really we only need to "de-optimize" or spy on "key" function +calls which occur against _unsafe_ operations. Or which occur against captured +function/method instances which cannot be statically identified. + +There may be games to be played with yield/coroutines here, but that could play +heck with normal generators. Intercepting "normal" calls with "normal" calls is +probably the easy strategy. diff --git a/components/typhon/src/python/typhon/__init__.py b/components/typhon/src/python/typhon/__init__.py new file mode 100644 index 0000000..da55d3e --- /dev/null +++ b/components/typhon/src/python/typhon/__init__.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +""" +A prototype 'flat' interpreter for a continuation based Lisp. + +Not intended to be anything real, just intended to serve as a prototype for how +to implement a flat, stackless, serializable interpreter. 
+""" + +import operator +import typing as t + +import attrs + + +@attrs.define() +class Vm: + """ + :field log: All evaluated expressions, in order + :field mod_ns: A map from symbols to currently bound expressions + :field continuations: A list of coroutines/continuations of control + """ + + log: t.List[list] + mod_ns: t.Dict[str, t.Any] = attrs.field(factory=dict) + continuations: t.List["Cont"] = attrs.field(factory=list) + + +@attrs.define() +class Cont: + """ + Continuations represent sequences of evaluation. + + :field frames: A list of call frames + """ + + frames: t.List["Frame"] = attrs.field(factory=list) + + +@attrs.define() +class Frame: + """ + Frames represent function call boundaries. + + :field pc: Program points within the AST being interpreted + :field frame_ns: Frame-local bindings + """ + + pc: list = attrs.field(factory=lambda: [0]) + frame_ns: dict = attrs.field(factory=dict) + op_stack: list = attrs.field(factory=list) + data_stack: list = attrs.field(factory=list) + + +def chain(its): + for it in its: + yield from it + + +def get_in(idxable, idxs): + for idx in idxs: + idxable = idxable[idx] + return idxable + + +class Operator: + """Stack operators.""" + + +class Task: + @attrs.define() + class Complete: + """Signals program termination.""" + + pass + + @attrs.define() + class Eval: + """Evaluate the current term.""" + + pass + + @attrs.define() + class Apply: + """Apply an operator to the stack.""" + + operator: "Operator" = attrs.field() + + @attrs.define() + class Next: + """Advance the program counter to the 'next' statement.""" + + pc: list + + +def step(vm: Vm, cont: Cont, frame: Frame): + """Step moves the VM forwards one "cycle". + + This VM is built as a semi-VM. + + Interpretation occurs at two levels - the first over a direct syntax tree. + This allows the interpreter to forego a formal VM or any compilation step. + + This model however poses a challenge for AST nodes with data dependencies. 
+ Sub-expressions are the most obivous example. For these, we need semi-VM + behavior where the AST node is translated into a sequence of concrete + sub-step demanded operations which can actually be executed. Note that this + provides a neat natural definition of a tracing JIT or optimizing compiler. + + This makes the execution mode well bimodal. + + - On the one hand if we have demanded operations on the stack (`op_stack`) + then we want to execute the "top" demanded operation. + + - On the other, if we have no demanded operations we want to + + """ + + match expr: + case ["native", native_fn, arg_exprs, vararg_expr]: + pass + + case ["invoke", fn_expr, arg_exprs, vararg_expr]: + pass + + case ["var", name]: + pass + + case int() | float() | str() | bool(): + pass + + +def run(program, state=None): + vm = state or Vm([], {}, []) + replc = Cont([Frame([0], {})]) + vm.continuations.append(replc) + + for form in program: + # Enter the form into the log + vm.log.append(form) + # Set the REPL continuation's point to the new form + replc.frames = [Frame([len(vm.log) - 1])] + # Evaluate the continuation to completion (or something) + while True: + match (state := step(vm, replc, replc.frames[-1])): + case _: + pass diff --git a/components/typhon/test/python/__init__.py b/components/typhon/test/python/__init__.py new file mode 100644 index 0000000..e5a0d9b --- /dev/null +++ b/components/typhon/test/python/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/components/typhon/test/python/test.ty b/components/typhon/test/python/test.ty new file mode 100644 index 0000000..baf0f8b --- /dev/null +++ b/components/typhon/test/python/test.ty @@ -0,0 +1,20 @@ +; -*- mode: scheme -*- + +(define + + (lambda (x y) + (py "lambda x, y: x + y" [x y] []))) + +(define format + (lambda (&args) + (py "lambda fmt, *args: fmt % args" [fmt] &args))) + +(define print + (lambda (&args) + (py "print" [] &args))) + +(define add2 + (lambda (x) + (+ (2 x)))) + +(print + (format "%d" 
(add2 2)))