CS153/hw4/bin/frontend.ml
jmug b24a264f7e Update hw4 to a newer version.
Signed-off-by: jmug <u.g.a.mariano@gmail.com>
2025-01-24 21:01:32 -08:00

464 lines
18 KiB
OCaml

open Ll
open Llutil
open Ast
(* This file is where much of the work of the project will be carried out.
Follow the instructions on the project web site, but first skim through
this file to see what it contains.
*)
(* instruction streams ------------------------------------------------------ *)
(* As in the last project, we'll be working with a flattened representation
of LLVMlite programs to make emitting code easier. This version
additionally makes it possible to emit elements that will be gathered up and
"hoisted" to specific parts of the constructed CFG
- G of gid * Ll.gdecl: allows you to output global definitions in the middle
of the instruction stream. You will find this useful for compiling string
literals
- E of uid * insn: allows you to emit an instruction that will be moved up
to the entry block of the current function. This will be useful for
compiling local variable declarations
*)
(* One element of an instruction stream. [G] and [E] elements do not stay
   where they were emitted: cfg_of_stream collects every [G] into the list of
   global declarations it returns, and places every [E] instruction ahead of
   the ordinary instructions of the entry block. *)
type elt =
| L of Ll.lbl (* block label: marks the start of a labeled block *)
| I of uid * Ll.insn (* ordinary instruction, named by its uid *)
| T of Ll.terminator (* block terminator *)
| G of gid * Ll.gdecl (* hoisted globals (usually strings) *)
| E of uid * Ll.insn (* hoisted entry block instructions *)
(* The type of streams of LLVMLite instructions. Note that to improve performance,
* we will emit the instructions in reverse order. That is, the LLVMLite code:
* %1 = mul i64 2, 2
* %2 = add i64 1, %1
* br label %l1
* would be constructed as a stream as follows:
* I ("1", Binop (Mul, I64, Const 2L, Const 2L))
* >:: I ("2", Binop (Add, I64, Const 1L, Id "1"))
* >:: T (Br "l1")
*)
(* A stream is a plain list of elements kept in reverse emission order: the
   head of the list is the most recently emitted element. *)
type stream = elt list
(* [earlier >@ later] appends the stream [later] after [earlier]. Because
   streams are stored in reverse, [later] ends up at the front of the list. *)
let ( >@ ) earlier later = List.append later earlier

(* [rest >:: e] emits the single element [e] after the stream [rest], i.e.
   pushes [e] onto the head of the reversed list. *)
let ( >:: ) rest e = List.cons e rest
(* Turn a list of named instructions, given in program order, into a stream:
   each pair is wrapped in [I] and the list is reversed in the same pass. *)
let lift : (uid * insn) list -> stream =
  fun named_insns ->
    List.rev_map (fun (uid, insn) -> I (uid, insn)) named_insns
(* Build a CFG and collection of global variable definitions from a stream.

   [code] is stored in reverse emission order, so the fold below walks the
   function backwards: for each block it meets the terminator first, then the
   instructions, and finally the label that opens the block.

   - [T] starts a new pending block and discards any instructions collected
     since the last label (they would come after the terminator in program
     order).
   - [L] closes the pending block. A label with neither a terminator nor any
     instructions following it is silently dropped.
   - [G] elements are accumulated into the returned global list.
   - [E] elements are accumulated separately and placed in front of the entry
     block's own instructions.

   Whatever is still pending when the fold finishes becomes the entry block.

   Returns [((entry_block, labeled_blocks), globals)].
   Raises [Failure] when a labeled block with instructions, or the entry
   block, has no terminator. *)
let cfg_of_stream (code:stream) : Ll.cfg * (Ll.gid * Ll.gdecl) list =
  let step (gs, einsns, insns, term_opt, blks) e =
    match e with
    | L l ->
      begin match term_opt, insns with
      | Some term, _ ->
        (gs, einsns, [], None, (l, {insns; term})::blks)
      | None, [] ->
        (gs, einsns, [], None, blks)
      | None, _ :: _ ->
        (* Kept on one line: a backslash-newline inside a string literal also
           swallows the leading blanks of the next line, which used to glue
           the words together as "hasno terminator". *)
        failwith @@ Printf.sprintf
          "build_cfg: block labeled %s has no terminator" l
      end
    | T t -> (gs, einsns, [], Some (Llutil.Parsing.gensym "tmn", t), blks)
    | I (uid,insn) -> (gs, einsns, (uid,insn)::insns, term_opt, blks)
    | G (gid,gdecl) -> ((gid,gdecl)::gs, einsns, insns, term_opt, blks)
    | E (uid,i) -> (gs, (uid, i)::einsns, insns, term_opt, blks)
  in
  let gs, einsns, insns, term_opt, blks =
    List.fold_left step ([], [], [], None, []) code
  in
  match term_opt with
  | None -> failwith "build_cfg: entry block has no terminator"
  | Some term ->
    (* Hoisted instructions go ahead of the entry block's own code. *)
    let insns = einsns @ insns in
    ({insns; term}, blks), gs
(* compilation contexts ----------------------------------------------------- *)
(* To compile OAT variables, we maintain a mapping of source identifiers to the
corresponding LLVMlite operands. Bindings are added for global OAT variables
and local variables that are in scope. *)
module Ctxt = struct
  (* An association list from OAT identifiers to their LLVMlite type and
     operand. New bindings are consed onto the front, so they shadow older
     bindings of the same identifier. *)
  type t = (Ast.id * (Ll.ty * Ll.operand)) list

  (* The context with no bindings *)
  let empty = []

  (* Add a binding to the context *)
  let add (c:t) (id:id) (bnd:Ll.ty * Ll.operand) : t = (id,bnd)::c

  (* Lookup a binding in the context.
     Raises [Not_found] if [id] is unbound. *)
  let lookup (id:Ast.id) (c:t) : Ll.ty * Ll.operand =
    List.assoc id c

  (* Lookup a function, fail otherwise.
     Raises [Not_found] if [id] is unbound, and [Failure] if it is bound to
     something whose type is not a pointer to a function type. *)
  let lookup_function (id:Ast.id) (c:t) : Ll.ty * Ll.operand =
    match List.assoc id c with
    | (Ptr (Fun _), _) as bnd -> bnd
    | _ -> failwith @@ id ^ " not bound to a function"

  (* Like [lookup_function], but returns [None] instead of raising. Only the
     two exceptions [lookup_function] itself raises are turned into [None];
     anything else (e.g. an out-of-memory condition) still propagates. *)
  let lookup_function_option (id:Ast.id) (c:t) : (Ll.ty * Ll.operand) option =
    match lookup_function id c with
    | bnd -> Some bnd
    | exception Not_found -> None
    | exception (Failure _) -> None
end
(* compiling OAT types ------------------------------------------------------ *)
(* The mapping of source types onto LLVMlite is straightforward. Booleans and ints
are represented as the corresponding integer types. OAT strings are
pointers to bytes (I8). Arrays are the most interesting type: they are
represented as pointers to structs where the first component is the number
of elements in the following array.
The trickiest part of this project will be satisfying LLVM's rudimentary type
system. Recall that global arrays in LLVMlite need to be declared with their
length in the type to statically allocate the right amount of memory. The
global strings and arrays you emit will therefore have a more specific type
annotation than the output of cmp_rty. You will have to carefully bitcast
gids to satisfy the LLVM type checker.
*)
(* Translate an OAT type to its LLVMlite representation. Reference types
   become pointers to the translation of the referenced type. *)
let rec cmp_ty (t : Ast.ty) : Ll.ty =
  match t with
  | Ast.TBool -> I1
  | Ast.TInt -> I64
  | Ast.TRef r -> Ptr (cmp_rty r)

(* Translate the target of an OAT reference type: strings are bytes, arrays
   are a struct of an i64 followed by a zero-length array of the element
   type, and function types are translated component-wise. *)
and cmp_rty (r : Ast.rty) : Ll.ty =
  match r with
  | Ast.RString -> I8
  | Ast.RArray elt_ty -> Struct [I64; Array (0, cmp_ty elt_ty)]
  | Ast.RFun (arg_tys, ret_ty) ->
    let args, ret = cmp_fty (arg_tys, ret_ty) in
    Fun (args, ret)

(* Translate a function return type; a missing return value is [Void]. *)
and cmp_ret_ty (r : Ast.ret_ty) : Ll.ty =
  match r with
  | Ast.RetVoid -> Void
  | Ast.RetVal t -> cmp_ty t

(* Translate an OAT function signature (argument types, return type). *)
and cmp_fty (ts, r) : Ll.fty =
  (List.map cmp_ty ts, cmp_ret_ty r)
(* OAT typing of a binary operator, as
   (left operand type, right operand type, result type). *)
let typ_of_binop (op : Ast.binop) : Ast.ty * Ast.ty * Ast.ty =
  match op with
  | And | Or -> (TBool, TBool, TBool)
  | Eq | Neq | Lt | Lte | Gt | Gte -> (TInt, TInt, TBool)
  | Add | Mul | Sub | Shl | Shr | Sar | IAnd | IOr -> (TInt, TInt, TInt)
(* OAT typing of a unary operator, as (operand type, result type). *)
let typ_of_unop (op : Ast.unop) : Ast.ty * Ast.ty =
  match op with
  | Lognot -> (TBool, TBool)
  | Neg | Bitnot -> (TInt, TInt)
(* Compiler Invariants
The LLVM IR type of a variable (whether global or local) that stores an Oat
array value (or any other reference type, like "string") will always be a
double pointer. In general, any Oat variable of Oat-type t will be
represented by an LLVM IR value of type Ptr (cmp_ty t). So the Oat variable
x : int will be represented by an LLVM IR value of type i64*, y : string will
be represented by a value of type i8**, and arr : int[] will be represented
by a value of type {i64, [0 x i64]}**. Whether the LLVM IR type is a
"single" or "double" pointer depends on whether t is a reference type.
We can think of the compiler as paying careful attention to whether a piece
of Oat syntax denotes the "value" of an expression or a pointer to the
"storage space associated with it". This is the distinction between an
"expression" and the "left-hand-side" of an assignment statement. Compiling
an Oat variable identifier as an expression ("value") does the load, so
cmp_exp called on an Oat variable of type t returns (code that) generates a
LLVM IR value of type cmp_ty t. Compiling an identifier as a left-hand-side
does not do the load, so cmp_lhs called on an Oat variable of type t returns
an operand of type (cmp_ty t)*. Extending these invariants to account for
array accesses: the assignment e1[e2] = e3; treats e1[e2] as a
left-hand-side, so we compile it as follows: compile e1 as an expression to
obtain an array value (which is a pointer of type {i64, [0 x s]}* ).
compile e2 as an expression to obtain an operand of type i64, generate code
that uses getelementptr to compute the offset from the array value, which is
a pointer to the "storage space associated with e1[e2]".
On the other hand, compiling e1[e2] as an expression (to obtain the value of
the array), we can simply compile e1[e2] as a left-hand-side and then do the
load. So cmp_exp and cmp_lhs are mutually recursive. [[Actually, as I am
writing this, I think it could make sense to factor the Oat grammar in this
way, which would make things clearer; I may do that for next time around.]]
Consider globals7.oat (in hw4programs)
/--------------- globals7.oat ------------------
global arr = int[] null;
int foo() {
var x = new int[3];
arr = x;
x[2] = 3;
return arr[2];
}
/------------------------------------------------
The translation (given by cmp_ty) of the type int[] is {i64, [0 x i64]}* so
the corresponding LLVM IR declaration will look like:
@arr = global { i64, [0 x i64] }* null
This means that the type of the LLVM IR identifier @arr is {i64, [0 x i64]}**
which is consistent with the type of a locally-declared array variable.
The local variable x would be allocated and initialized by (something like)
the following code snippet. Here %_x7 is the LLVM IR uid containing the
pointer to the "storage space" for the Oat variable x.
%_x7 = alloca { i64, [0 x i64] }* ;; (1)
%_raw_array5 = call i64* @oat_alloc_array(i64 3) ;; (2)
%_array6 = bitcast i64* %_raw_array5 to { i64, [0 x i64] }* ;; (3)
store { i64, [0 x i64]}* %_array6, { i64, [0 x i64] }** %_x7 ;; (4)
(1) note that alloca uses cmp_ty (int[]) to find the type, so %_x7 has
the same type as @arr
(2) @oat_alloc_array allocates len+1 i64's
(3) we have to bitcast the result of @oat_alloc_array so we can store it
in %_x7
(4) stores the resulting array value (itself a pointer) into %_x7
The assignment arr = x; gets compiled to (something like):
%_x8 = load { i64, [0 x i64] }*, { i64, [0 x i64] }** %_x7 ;; (5)
store {i64, [0 x i64] }* %_x8, { i64, [0 x i64] }** @arr ;; (6)
(5) load the array value (a pointer) that is stored in the address pointed
to by %_x7
(6) store the array value (a pointer) into @arr
The assignment x[2] = 3; gets compiled to (something like):
%_x9 = load { i64, [0 x i64] }*, { i64, [0 x i64] }** %_x7 ;; (7)
%_index_ptr11 = getelementptr { i64, [0 x i64] },
{ i64, [0 x i64] }* %_x9, i32 0, i32 1, i32 2 ;; (8)
store i64 3, i64* %_index_ptr11 ;; (9)
(7) as above, load the array value that is stored in %_x7
(8) calculate the offset from the array using GEP
(9) store 3 into the array
Finally, return arr[2]; gets compiled to (something like) the following.
Note that the way arr is treated is identical to x. (Once we set up the
translation, there is no difference between Oat globals and locals, except
how their storage space is initially allocated.)
%_arr12 = load { i64, [0 x i64] }*, { i64, [0 x i64] }** @arr ;; (10)
%_index_ptr14 = getelementptr { i64, [0 x i64] },
{ i64, [0 x i64] }* %_arr12, i32 0, i32 1, i32 2 ;; (11)
%_index15 = load i64, i64* %_index_ptr14 ;; (12)
ret i64 %_index15
(10) just like for %_x9, load the array value that is stored in @arr
(11) calculate the array index offset
(12) load the array value at the index
*)
(* Global initialized arrays:
There is another wrinkle: to compile global initialized arrays like in the
globals4.oat, it is helpful to do a bitcast once at the global scope to
convert the "precise type" required by the LLVM initializer to the actual
translation type (which sets the array length to 0). So for globals4.oat,
the arr global would compile to (something like):
@arr = global { i64, [0 x i64] }* bitcast
({ i64, [4 x i64] }* @_global_arr5 to { i64, [0 x i64] }* )
@_global_arr5 = global { i64, [4 x i64] }
{ i64 4, [4 x i64] [ i64 1, i64 2, i64 3, i64 4 ] }
*)
(* Some useful helper functions *)
(* Produce a fresh temporary identifier of the form "_<s><n>", where <n> is a
   counter shared by every call and incremented before use (so the first
   identifier ends in 1). OAT identifiers cannot begin with an underscore, so
   these names should not clash with any source variable. *)
let gensym : string -> string =
  let counter = ref 0 in
  fun (s:string) ->
    counter := !counter + 1;
    "_" ^ s ^ string_of_int !counter
(* Amount of space an Oat type takes when stored in the stack, in bytes.
   Note that since structured values are manipulated by reference, all
   Oat values take 8 bytes on the stack. The type argument is therefore
   ignored; it is named with a leading underscore so the compiler does not
   report it as unused.
*)
let size_oat_ty (_t : Ast.ty) : int64 = 8L
(* Generate code to allocate a zero-initialized array of source type
   TRef (RArray t) of the given size. Note "size" is an operand whose value
   can be computed at runtime.

   Returns (type of the array value, operand holding it, instruction stream).
   The stream calls @oat_alloc_array, which yields an i64*, and then bitcasts
   that raw pointer to the translated array type. *)
let oat_alloc_array (t:Ast.ty) (size:Ll.operand) : Ll.ty * operand * stream =
  (* Each gensym call bumps a shared counter, so the two names are drawn in
     separate, sequenced bindings rather than inside one tuple, whose
     evaluation order the language leaves unspecified. The raw pointer gets
     the lower number, as in the worked example (%_raw_array5, %_array6). *)
  let arr_id = gensym "raw_array" in
  let ans_id = gensym "array" in
  let ans_ty = cmp_ty @@ TRef (RArray t) in
  let arr_ty = Ptr I64 in
  ans_ty, Id ans_id, lift
    [ arr_id, Call (arr_ty, Gid "oat_alloc_array", [I64, size])
    ; ans_id, Bitcast (arr_ty, Id arr_id, ans_ty) ]
(* Compiles an expression exp in context c, outputting the Ll operand that will
receive the value of the expression, and the stream of instructions
implementing the expression.
Tips:
- use the provided cmp_ty function!
- string literals (CStr s) should be hoisted. You'll need to make sure
either that the resulting gid has type (Ptr I8), or, if the gid has type
[n x i8] (where n is the length of the string), convert the gid to a
(Ptr I8), e.g., by using getelementptr.
- use the provided "oat_alloc_array" function to implement literal arrays
(CArr) and the (NewArr) expressions
*)
(* Placeholder: not implemented yet, so every call raises [Failure]. *)
let rec cmp_exp (c:Ctxt.t) (exp:Ast.exp node) : Ll.ty * Ll.operand * stream =
  raise (Failure "cmp_exp unimplemented")
(* Compile a statement in context c with return typ rt. Return a new context,
possibly extended with new local bindings, and the instruction stream
implementing the statement.
Left-hand-sides of assignment statements must either be OAT identifiers,
or an index into some arbitrary expression of array type. Otherwise, the
program is not well-formed and your compiler may throw an error.
Tips:
- for local variable declarations, you will need to emit Allocas in the
entry block of the current function using the E() constructor.
- don't forget to add bindings to the context for local variable
declarations
- you can avoid some work by translating For loops to the corresponding
While loop, building the AST and recursively calling cmp_stmt
- you might find it helpful to reuse the code you wrote for the Call
expression to implement the SCall statement
- compiling the left-hand-side of an assignment is almost exactly like
compiling the Id or Index expression. Instead of loading the resulting
pointer, you just need to store to it!
*)
(* Placeholder: not implemented yet, so every call raises [Failure]. *)
let rec cmp_stmt (c:Ctxt.t) (rt:Ll.ty) (stmt:Ast.stmt node) : Ctxt.t * stream =
  raise (Failure "cmp_stmt not implemented")

(* Compile a series of statements, left to right. The context returned by
   each statement is used to compile the next one, and each statement's code
   is appended after the code accumulated so far. Returns the final context
   together with the combined stream. *)
and cmp_block (c:Ctxt.t) (rt:Ll.ty) (stmts:Ast.block) : Ctxt.t * stream =
  let rec go ctxt code = function
    | [] -> (ctxt, code)
    | s :: rest ->
      let ctxt', stmt_code = cmp_stmt ctxt rt s in
      go ctxt' (code >@ stmt_code) rest
  in
  go c [] stmts
(* Adds each function identifier declared in the program to the context at
   an appropriately translated type (the translation of a reference to the
   function's OAT type). Declarations that are not functions leave the
   context unchanged.
   NOTE: The Gid of a function is just its source name
*)
let cmp_function_ctxt (c:Ctxt.t) (p:Ast.prog) : Ctxt.t =
  let bind_function ctxt decl =
    match decl with
    | Ast.Gfdecl { elt = { frtyp; fname; args; _ }; _ } ->
      let arg_tys = List.map fst args in
      let fun_ty = cmp_ty (TRef (RFun (arg_tys, frtyp))) in
      Ctxt.add ctxt fname (fun_ty, Gid fname)
    | _ -> ctxt
  in
  List.fold_left bind_function c p
(* Populate a context with bindings for global variables
mapping OAT identifiers to LLVMlite gids and their types.
Only a small subset of OAT expressions can be used as global initializers
in well-formed programs. (The constructors starting with C).
*)
(* Placeholder: not implemented yet, so every call raises [Failure]. *)
let cmp_global_ctxt (c:Ctxt.t) (p:Ast.prog) : Ctxt.t =
  raise (Failure "cmp_global_ctxt unimplemented")
(* Compile a function declaration in global context c. Return the LLVMlite cfg
and a list of global declarations containing the string literals appearing
in the function.
You will need to
1. Allocate stack space for the function parameters using Alloca
2. Store the function arguments in their corresponding alloca'd stack slot
3. Extend the context with bindings for function variables
4. Compile the body of the function using cmp_block
5. Use cfg_of_stream to produce a LLVMlite cfg from the resulting stream
*)
(* Placeholder: not implemented yet, so every call raises [Failure]. *)
let cmp_fdecl (c:Ctxt.t) (f:Ast.fdecl node) : Ll.fdecl * (Ll.gid * Ll.gdecl) list =
  raise (Failure "cmp_fdecl not implemented")
(* Compile a global initializer, returning the resulting LLVMlite global
declaration, and a list of additional global declarations.
Tips:
- Only CNull, CBool, CInt, CStr, and CArr can appear as global initializers
in well-formed OAT programs. Your compiler may throw an error for the other
cases
- OAT arrays are always handled via pointers. A global array of arrays will
be an array of pointers to arrays emitted as additional global declarations.
*)
(* Placeholder: not implemented yet, so every call raises [Failure]. The
   message names this function, so a failure can be traced to the right
   stub. *)
let rec cmp_gexp c (e:Ast.exp node) : Ll.gdecl * (Ll.gid * Ll.gdecl) list =
  failwith "cmp_gexp not implemented"
(* Oat internals function context ------------------------------------------- *)
(* Runtime functions called by generated code but not exposed to OAT
   programs, paired with their LLVMlite function types. *)
let internals : (Ll.gid * Ll.ty) list =
  [ ("oat_alloc_array", Ll.Fun ([I64], Ptr I64)) ]
(* Oat builtin function context --------------------------------------------- *)
(* Built-in functions, each paired with the LLVMlite translation of its OAT
   function type. *)
let builtins =
  (* [fn args ret] translates the OAT function type from [args] to [ret] *)
  let fn args ret = cmp_rty @@ RFun (args, ret) in
  let str = TRef RString in
  let int_arr = TRef (RArray TInt) in
  [ "array_of_string", fn [str] (RetVal int_arr)
  ; "string_of_array", fn [int_arr] (RetVal str)
  ; "length_of_string", fn [str] (RetVal TInt)
  ; "string_of_int", fn [TInt] (RetVal str)
  ; "string_cat", fn [str; str] (RetVal str)
  ; "print_string", fn [str] RetVoid
  ; "print_int", fn [TInt] RetVoid
  ; "print_bool", fn [TBool] RetVoid
  ]
(* Compile a OAT program to LLVMlite.

   The compilation context is built in three layers: the built-in functions,
   then the program's own functions, then its global variables. Every
   top-level declaration is then compiled in that context. The resulting
   program has no type declarations, and declares the internals and builtins
   as externals. *)
let cmp_prog (p:Ast.prog) : Ll.prog =
  (* each builtin is bound to a global function pointer named after itself *)
  let add_builtin ctxt (name, fty) =
    Ctxt.add ctxt name (Ll.Ptr fty, Gid name)
  in
  let builtin_ctxt = List.fold_left add_builtin Ctxt.empty builtins in
  let function_ctxt = cmp_function_ctxt builtin_ctxt p in
  let c = cmp_global_ctxt function_ctxt p in
  (* Compile one declaration and push its output onto the accumulated
     function and global lists. Globals produced while compiling the
     declaration are placed ahead of those accumulated so far. *)
  let compile_decl decl (fs, gs) =
    match decl with
    | Ast.Gfdecl fd ->
      let fdecl, hoisted = cmp_fdecl c fd in
      ((fd.elt.fname, fdecl) :: fs, hoisted @ gs)
    | Ast.Gvdecl { elt = gd; _ } ->
      let ll_gd, extra = cmp_gexp c gd.init in
      (fs, ((gd.name, ll_gd) :: extra) @ gs)
  in
  (* fold from the right so the output lists keep source order *)
  let fdecls, gdecls = List.fold_right compile_decl p ([], []) in
  { tdecls = []; gdecls; fdecls; edecls = internals @ builtins }