534 lines
17 KiB
OCaml
534 lines
17 KiB
OCaml
(* X86lite Simulator *)
|
|
|
|
(* See the documentation in the X86lite specification, available on the
|
|
course web pages, for a detailed explanation of the instruction
|
|
semantics.
|
|
*)
|
|
|
|
open X86
|
|
|
|
(* simulator machine state -------------------------------------------------- *)
|
|
|
|
let mem_bot = 0x400000L (* lowest valid address *)
|
|
let mem_top = 0x410000L (* one past the last byte in memory *)
|
|
let mem_size = Int64.to_int (Int64.sub mem_top mem_bot)
|
|
let nregs = 17 (* including Rip *)
|
|
let ins_size = 8L (* assume we have a 8-byte encoding *)
|
|
let exit_addr = 0xfdeadL (* halt when m.regs(%rip) = exit_addr *)
|
|
|
|
(* The simulator memory maps addresses to symbolic bytes. Symbolic
|
|
bytes are either actual data indicated by the Byte constructor or
|
|
'symbolic instructions' that take up eight bytes for the purposes of
|
|
layout.
|
|
|
|
The symbolic bytes abstract away from the details of how
|
|
instructions are represented in memory. Each instruction takes
|
|
exactly eight consecutive bytes, where the first byte InsB0 stores
|
|
the actual instruction, and the next sevent bytes are InsFrag
|
|
elements, which aren't valid data.
|
|
|
|
For example, the two-instruction sequence:
|
|
at&t syntax ocaml syntax
|
|
movq %rdi, (%rsp) Movq, [~%Rdi; Ind2 Rsp]
|
|
decq %rdi Decq, [~%Rdi]
|
|
|
|
is represented by the following elements of the mem array (starting
|
|
at address 0x400000):
|
|
|
|
0x400000 : InsB0 (Movq, [~%Rdi; Ind2 Rsp])
|
|
0x400001 : InsFrag
|
|
0x400002 : InsFrag
|
|
0x400003 : InsFrag
|
|
0x400004 : InsFrag
|
|
0x400005 : InsFrag
|
|
0x400006 : InsFrag
|
|
0x400007 : InsFrag
|
|
0x400008 : InsB0 (Decq, [~%Rdi])
|
|
0x40000A : InsFrag
|
|
0x40000B : InsFrag
|
|
0x40000C : InsFrag
|
|
0x40000D : InsFrag
|
|
0x40000E : InsFrag
|
|
0x40000F : InsFrag
|
|
0x400010 : InsFrag
|
|
*)
|
|
type sbyte =
|
|
| InsB0 of ins (* 1st byte of an instruction *)
|
|
| InsFrag (* 2nd - 8th bytes of an instruction *)
|
|
| Byte of char (* non-instruction byte *)
|
|
|
|
(* memory maps addresses to symbolic bytes *)
|
|
type mem = sbyte array
|
|
|
|
(* Flags for condition codes *)
|
|
type flags =
|
|
{ mutable fo : bool
|
|
; mutable fs : bool
|
|
; mutable fz : bool
|
|
}
|
|
|
|
(* Register files *)
|
|
type regs = int64 array
|
|
|
|
(* Complete machine state *)
|
|
type mach =
|
|
{ flags : flags
|
|
; regs : regs
|
|
; mem : mem
|
|
}
|
|
|
|
(* simulator helper functions ----------------------------------------------- *)
|
|
|
|
(* The index of a register in the regs array *)
|
|
let rind : reg -> int = function
|
|
| Rip -> 16
|
|
| Rax -> 0
|
|
| Rbx -> 1
|
|
| Rcx -> 2
|
|
| Rdx -> 3
|
|
| Rsi -> 4
|
|
| Rdi -> 5
|
|
| Rbp -> 6
|
|
| Rsp -> 7
|
|
| R08 -> 8
|
|
| R09 -> 9
|
|
| R10 -> 10
|
|
| R11 -> 11
|
|
| R12 -> 12
|
|
| R13 -> 13
|
|
| R14 -> 14
|
|
| R15 -> 15
|
|
;;
|
|
|
|
(* Helper functions for reading/writing sbytes *)
|
|
|
|
(* Convert an int64 to its sbyte representation *)
|
|
let sbytes_of_int64 (i : int64) : sbyte list =
|
|
let open Char in
|
|
let open Int64 in
|
|
List.map
|
|
(fun n -> Byte (shift_right i n |> logand 0xffL |> to_int |> chr))
|
|
[ 0; 8; 16; 24; 32; 40; 48; 56 ]
|
|
;;
|
|
|
|
(* Convert an sbyte representation to an int64 *)
|
|
let int64_of_sbytes (bs : sbyte list) : int64 =
|
|
let open Char in
|
|
let open Int64 in
|
|
let f b i =
|
|
match b with
|
|
| Byte c -> logor (shift_left i 8) (c |> code |> of_int)
|
|
| _ -> 0L
|
|
in
|
|
List.fold_right f bs 0L
|
|
;;
|
|
|
|
(* Convert a string to its sbyte representation *)
|
|
let sbytes_of_string (s : string) : sbyte list =
|
|
let rec loop acc = function
|
|
| i when i < 0 -> acc
|
|
| i -> loop (Byte s.[i] :: acc) (pred i)
|
|
in
|
|
loop [ Byte '\x00' ] @@ (String.length s - 1)
|
|
;;
|
|
|
|
(* Serialize an instruction to sbytes *)
|
|
let sbytes_of_ins ((op, args) : ins) : sbyte list =
|
|
let check = function
|
|
| Imm (Lbl _) | Ind1 (Lbl _) | Ind3 (Lbl _, _) ->
|
|
invalid_arg "sbytes_of_ins: tried to serialize a label!"
|
|
| _ -> ()
|
|
in
|
|
List.iter check args;
|
|
[ InsB0 (op, args); InsFrag; InsFrag; InsFrag; InsFrag; InsFrag; InsFrag; InsFrag ]
|
|
;;
|
|
|
|
(* Serialize a data element to sbytes *)
|
|
let sbytes_of_data : data -> sbyte list = function
|
|
| Quad (Lit i) -> sbytes_of_int64 i
|
|
| Asciz s -> sbytes_of_string s
|
|
| Quad (Lbl _) -> invalid_arg "sbytes_of_data: tried to serialize a label!"
|
|
;;
|
|
|
|
(* It might be useful to toggle printing of intermediate states of your
|
|
simulator. Our implementation uses this mutable flag to turn on/off
|
|
printing. For instance, you might write something like:
|
|
|
|
[if !debug_simulator then print_endline @@ string_of_ins u; ...]
|
|
*)
|
|
let debug_simulator = ref false
|
|
|
|
(* override some useful operators *)
|
|
let ( +. ) = Int64.add
|
|
let ( -. ) = Int64.sub
|
|
let ( *. ) = Int64.mul
|
|
let ( <. ) a b = Int64.compare a b < 0
|
|
let ( >. ) a b = Int64.compare a b > 0
|
|
let ( <=. ) a b = Int64.compare a b <= 0
|
|
let ( >=. ) a b = Int64.compare a b >= 0
|
|
|
|
(* Interpret a condition code with respect to the given flags. *)
|
|
(* !!! Check the Specification for Help *)
|
|
let interp_cnd { fo; fs; fz } : cnd -> bool = function
|
|
| Eq -> fz
|
|
| Neq -> not fz
|
|
| Gt -> fo = fs && not fz
|
|
| Ge -> fo = fs
|
|
| Lt -> fo <> fs
|
|
| Le -> fo <> fs || fz
|
|
|
|
(* Maps an X86lite address into Some OCaml array index,
|
|
or None if the address is not within the legal address space. *)
|
|
let map_addr (addr : quad) : int option =
|
|
if addr >=. mem_bot && addr <. mem_top then
|
|
Some (Int64.to_int (addr -. mem_bot))
|
|
else
|
|
None
|
|
|
|
|
|
(* Your simulator should raise this exception if it tries to read from or
|
|
store to an address not within the valid address space. *)
|
|
exception X86lite_segfault
|
|
|
|
(* Raise X86lite_segfault when addr is invalid. *)
|
|
let map_addr_segfault (addr : quad) : int =
|
|
let idx = map_addr addr in
|
|
match idx with
|
|
| Some i -> i
|
|
| None -> raise X86lite_segfault
|
|
|
|
(* Simulates one step of the machine:
|
|
- fetch the instruction at %rip
|
|
- compute the source and/or destination information from the operands
|
|
- simulate the instruction semantics
|
|
- update the registers and/or memory appropriately
|
|
- set the condition flags
|
|
|
|
We provide the basic structure of step function and helper functions.
|
|
Implement the subroutine below to complete the step function.
|
|
See step function to understand each subroutine and how they
|
|
are glued together.
|
|
*)
|
|
|
|
let readquad (m : mach) (addr : quad) : quad =
|
|
int64_of_sbytes @@ Array.to_list @@ Array.sub m.mem (map_addr_segfault addr) 8
|
|
|
|
let writequad (m : mach) (addr : quad) (w : quad) : unit =
|
|
Array.blit (Array.of_list @@ sbytes_of_int64 w) 0 m.mem (map_addr_segfault addr) 8
|
|
|
|
exception InvalidInstructinAlignment
|
|
|
|
let fetchins (m : mach) (addr : quad) : ins =
|
|
let ins_byte = m.mem.(map_addr_segfault addr) in
|
|
match ins_byte with
|
|
| InsB0 i -> i
|
|
| _ -> raise InvalidInstructinAlignment
|
|
|
|
|
|
exception MalformedOpcodeArgs
|
|
(* Compute the instruction result.
|
|
* NOTE: See int64_overflow.ml for the definition of the return type
|
|
* Int64_overflow.t. *)
|
|
let interp_opcode (m : mach) (o : opcode) (args : int64 list) : Int64_overflow.t =
|
|
let open Int64 in
|
|
let open Int64_overflow in
|
|
match o, args with
|
|
| Negq, [ n ] -> neg n
|
|
| Incq, [ n ] -> succ n
|
|
| Decq, [ n ] -> pred n
|
|
| Addq, [n1; n2] -> add n1 n2
|
|
| Subq, [n1; n2] -> sub n2 n1
|
|
| Imulq, [n1; n2] -> mul n1 n2
|
|
| Notq, [ n ] -> ok @@ lognot n
|
|
| Andq, [n1; n2] -> ok @@ logand n1 n2
|
|
| Orq, [n1; n2] -> ok @@ logor n1 n2
|
|
| Xorq, [n1; n2] -> ok @@ logxor n1 n2
|
|
| Sarq, [a; n1] -> a |> to_int |> shift_right n1 |> ok
|
|
(* Passing the sign of the original value as the overflow is hacky, but it's
|
|
the only way to communicate information about the original operand to
|
|
the set_flags function *)
|
|
| Shrq, [a; n1] -> a |> to_int |> shift_right_logical n1 |> withok (n1 <. 0L)
|
|
(* Passing the sign of the original value as the overflow is hacky, but it's
|
|
the only way to communicate information about the original operand to
|
|
the set_flags function *)
|
|
| Shlq, [a; n1] -> a |> to_int |> shift_left n1 |> withok (n1 <. 0L)
|
|
| Set c, [ n1 ] -> if interp_cnd m.flags c then ok @@ logand n1 0xFFFFFFFFFFFFFF01L else ok @@ logand n1 0xFFFFFFFFFFFFFF00L
|
|
| Leaq, [i; _] -> ok i
|
|
| Movq, [s; _] -> ok s
|
|
| Cmpq, [n1; n2] -> sub n2 n1
|
|
| Jmp, [i] -> ok i
|
|
| J c, [d] -> if interp_cnd m.flags c then ok d else ok m.regs.(rind Rip)
|
|
| _ -> raise MalformedOpcodeArgs
|
|
;;
|
|
|
|
exception FoundLabelInAsm
|
|
exception UnknownWriteBack
|
|
|
|
(** Update machine state with instruction results. *)
|
|
let ins_writeback (m : mach) : ins -> int64 -> unit =
|
|
let write_to_dest q d = match d with
|
|
| Reg r -> m.regs.(rind r) <- q
|
|
| Ind1 (Lit a) -> writequad m a q
|
|
| Ind2 r -> writequad m m.regs.(rind r) q
|
|
| Ind3 (Lit dis, r) -> writequad m (m.regs.(rind r) +. dis) q
|
|
| _ -> raise FoundLabelInAsm
|
|
in
|
|
fun i q -> match i with
|
|
| Jmp, _ -> m.regs.(rind Rip) <- q
|
|
| J _, _ -> m.regs.(rind Rip) <- q
|
|
| Cmpq, _ -> ()
|
|
| _, [_; d] -> write_to_dest q d
|
|
| _, [ d ] -> write_to_dest q d
|
|
| _ -> raise UnknownWriteBack
|
|
;;
|
|
|
|
exception NonIndirectOrLabel
|
|
|
|
(* mem addr ---> mem array index *)
|
|
let interp_operands (m : mach) : ins -> int64 list =
|
|
let inter_value = function
|
|
| Imm ( Lit q ) -> q
|
|
| Reg r -> m.regs.(rind r)
|
|
| Ind1 ( Lit q ) -> readquad m q
|
|
| Ind2 r -> readquad m m.regs.(rind r)
|
|
| Ind3 (Lit q, r) -> readquad m @@ m.regs.(rind r) +. q
|
|
| _ -> raise FoundLabelInAsm
|
|
in
|
|
let inter_addr = function
|
|
| Ind1 (Lit q) -> q
|
|
| Ind2 r -> m.regs.(rind r)
|
|
| Ind3 (Lit q, r) -> m.regs.(rind r) +. q
|
|
| _ -> raise NonIndirectOrLabel
|
|
in
|
|
function
|
|
| Leaq, [i; d] -> [ inter_addr i; inter_value d ]
|
|
| _, ops -> List.map inter_value ops
|
|
;;
|
|
|
|
let validate_operands : ins -> unit = function
|
|
(* TODO: Add validations, leaq can only take indirect operands for example *)
|
|
| _ -> ()
|
|
;;
|
|
|
|
let crack : ins -> ins list = function
|
|
| Pushq, [ op ] -> [Subq, [Imm (Lit 8L); Reg Rsp]; Movq, [op; Ind2 Rsp]]
|
|
| Popq, [ op ] -> [Movq, [Ind2 Rsp; op]; Addq, [Imm (Lit 8L); Reg Rsp]]
|
|
(* TODO: These two have a more elegant implementation if we allow crack to be recursive *)
|
|
| Callq, [ op ] -> [Subq, [Imm (Lit 8L); Reg Rsp]; Movq, [Reg Rip; Ind2 Rsp]; Jmp, [op]]
|
|
| Retq, [] -> [Movq, [Ind2 Rsp; Reg Rip]; Addq, [Imm (Lit 8L); Reg Rsp]]
|
|
| i -> [ i ]
|
|
;;
|
|
|
|
(* TODO: double check against spec *)
|
|
let set_flags (m : mach) (op : opcode) (ws : quad list) (w : Int64_overflow.t) : unit =
|
|
let fz = w.value = 0L in
|
|
let fs = w.value <. 0L in
|
|
let fo = w.overflow in
|
|
let set_zf_and_sf = function
|
|
| Notq | Set _ | Leaq | Movq | Jmp | J _ | Callq | Retq | Pushq | Popq -> ()
|
|
| Sarq | Shlq | Shrq -> if 0L = (List.hd ws) then () else m.flags.fz <- fz; m.flags.fs <- fs
|
|
(* Technically, fz and fs are not defined for Imulq, but it's fine to set them to these values *)
|
|
| _ -> m.flags.fz <- fz; m.flags.fs <- fs
|
|
in
|
|
let set_of = function
|
|
| Notq | Set _ | Leaq | Movq | Jmp | J _ | Callq | Retq | Pushq | Popq -> ()
|
|
| Andq | Orq | Xorq -> m.flags.fo <- false
|
|
| Sarq -> if 1L = (List.hd ws) then m.flags.fo <- false else ()
|
|
(* fo and fs are calculated after the shift. When AMT == 1, comparing fs and fo
|
|
is equivalent to comparing the 2 most significant bits of the original value *)
|
|
| Shlq -> if 1L = (List.hd ws) then m.flags.fo <- (fs = fo) else ()
|
|
| Shrq -> if 1L = (List.hd ws) then m.flags.fo <- fo else ()
|
|
(* This covers Negq, Addq, Subq, Imulq, Incq, Decq and Cmpq *)
|
|
| _ -> m.flags.fo <- fo
|
|
in
|
|
set_zf_and_sf op;
|
|
set_of op
|
|
;;
|
|
|
|
let step (m : mach) : unit =
|
|
(* execute an instruction *)
|
|
let ((op, args) as ins) = fetchins m m.regs.(rind Rip) in
|
|
validate_operands ins;
|
|
(* Some instructions involve running two or more basic instructions.
|
|
* For other instructions, just return a list of one instruction.
|
|
* See the X86lite specification for details. *)
|
|
let uops : ins list = crack (op, args) in
|
|
m.regs.(rind Rip) <- m.regs.(rind Rip) +. ins_size;
|
|
List.iter
|
|
(fun ((uop, _) as u) ->
|
|
if !debug_simulator then print_endline @@ string_of_ins u;
|
|
let ws = interp_operands m u in
|
|
let res = interp_opcode m uop ws in
|
|
ins_writeback m u @@ res.Int64_overflow.value;
|
|
set_flags m op ws res)
|
|
uops
|
|
;;
|
|
|
|
(* Runs the machine until the rip register reaches a designated
|
|
memory address. Returns the contents of %rax when the
|
|
machine halts. *)
|
|
let run (m : mach) : int64 =
|
|
while m.regs.(rind Rip) <> exit_addr do
|
|
step m
|
|
done;
|
|
m.regs.(rind Rax)
|
|
;;
|
|
|
|
(* assembling and linking --------------------------------------------------- *)
|
|
|
|
(* A representation of the executable *)
|
|
type exec =
|
|
{ entry : quad (* address of the entry point *)
|
|
; text_pos : quad (* starting address of the code *)
|
|
; data_pos : quad (* starting address of the data *)
|
|
; text_seg : sbyte list (* contents of the text segment *)
|
|
; data_seg : sbyte list (* contents of the data segment *)
|
|
}
|
|
|
|
(* Assemble should raise this when a label is used but not defined *)
|
|
exception Undefined_sym of lbl
|
|
|
|
(* Assemble should raise this when a label is defined more than once *)
|
|
exception Redefined_sym of lbl
|
|
|
|
(* Convert an X86 program into an object file:
|
|
- separate the text and data segments
|
|
- compute the size of each segment
|
|
Note: the size of an Asciz string section is (1 + the string length)
|
|
due to the null terminator
|
|
|
|
- resolve the labels to concrete addresses and 'patch' the instructions to
|
|
replace Lbl values with the corresponding Imm values.
|
|
HINT: consider building a mapping from symboli Lbl to memory address
|
|
|
|
- the text segment starts at the lowest address
|
|
- the data segment starts after the text segment
|
|
|
|
HINT: List.fold_left and List.fold_right are your friends.
|
|
*)
|
|
|
|
exception UnexpectedData
|
|
exception UnexpectedText
|
|
|
|
let is_size (is : ins list) : quad = 8L *. (Int64.of_int @@ List.length is)
|
|
;;
|
|
|
|
let elem_is_size (e : elem) : quad = match e with
|
|
| { asm = Text il; _ } -> is_size il
|
|
| _ -> raise UnexpectedData
|
|
;;
|
|
|
|
let ds_size (ds : data list) : quad =
|
|
let d_size d acc = match d with
|
|
| Asciz s -> acc +. 1L +. (Int64.of_int @@ String.length s)
|
|
| Quad _ -> acc +. 8L
|
|
in
|
|
List.fold_right d_size ds 0L
|
|
;;
|
|
|
|
let elem_ds_size (e : elem) : quad = match e with
|
|
| { asm = Data dl; _ } -> ds_size dl
|
|
| _ -> raise UnexpectedText
|
|
;;
|
|
|
|
let elem_size (e : elem) : quad = match e with
|
|
| { asm = Data dl; _ } -> ds_size dl
|
|
| { asm = Text il; _ } -> is_size il
|
|
;;
|
|
|
|
type symbol_table = (lbl * quad) list
|
|
|
|
let build_symbol_table (p : prog) : symbol_table =
|
|
let accum_elem ((addr, tbl) as acc) ({ lbl = l; _ } as e) =
|
|
let curr = List.assoc_opt l tbl in
|
|
let new_addr = addr +. (elem_size e) in
|
|
let new_sym_tbl = (l, addr) :: tbl in
|
|
match curr with
|
|
| Some _ -> raise (Redefined_sym l)
|
|
| None -> (new_addr, new_sym_tbl)
|
|
in
|
|
let (_, sym_tbl) = List.fold_left accum_elem (mem_bot, []) p in
|
|
sym_tbl
|
|
;;
|
|
|
|
let patch_imm (i : imm) (t : symbol_table) : imm = match i with
|
|
| Lbl l ->
|
|
let addr_opt = List.assoc_opt l t in
|
|
(match addr_opt with
|
|
| Some addr -> Lit addr
|
|
| None -> raise (Undefined_sym l))
|
|
| _ -> i
|
|
;;
|
|
|
|
let patch_data (t : symbol_table) (d : data) : data = match d with
|
|
| Quad i -> Quad (patch_imm i t)
|
|
| _ -> d
|
|
;;
|
|
|
|
let patch_ins (t : symbol_table) ((op, operands) as i : ins) : ins =
|
|
let patch_operand o = match o with
|
|
| Imm im -> Imm (patch_imm im t)
|
|
| Ind1 im -> Ind1 (patch_imm im t)
|
|
| Ind3 (im, r) -> Ind3 (patch_imm im t, r)
|
|
| _ -> o
|
|
in
|
|
let new_operands = List.map patch_operand operands in
|
|
(op, new_operands)
|
|
;;
|
|
|
|
let emit_elem (t :symbol_table) ({ asm = a; _ } : elem) : sbyte list = match a with
|
|
| Text il -> List.fold_right (fun ins acc -> (sbytes_of_ins ins) @ acc) (List.map (patch_ins t) il) []
|
|
| Data dl -> List.fold_right (fun d acc -> (sbytes_of_data d) @ acc) (List.map (patch_data t) dl) []
|
|
;;
|
|
|
|
let assemble (p : prog) : exec =
|
|
let txts =
|
|
let append_if_text e acc = match e with
|
|
| { asm = Text _; _ } -> e :: acc
|
|
| _ -> acc
|
|
in
|
|
List.fold_right append_if_text p []
|
|
in
|
|
let txts_size = List.fold_left (fun acc e -> acc +. (elem_is_size e)) 0L txts in
|
|
let dtas =
|
|
let append_if_data e acc = match e with
|
|
| { asm = Data _; _ } -> e :: acc
|
|
| _ -> acc
|
|
in
|
|
List.fold_right append_if_data p []
|
|
in
|
|
let dtas_size = List.fold_left (fun acc e -> acc +. (elem_ds_size e)) 0L dtas in
|
|
let ordered_prog = txts @ dtas in
|
|
let sym_tbl = build_symbol_table ordered_prog in
|
|
let entry = match (List.assoc_opt "main" sym_tbl) with
|
|
| Some q -> q
|
|
| None -> raise (Undefined_sym "main")
|
|
in
|
|
let ee = emit_elem sym_tbl in
|
|
let d_seg = List.fold_right (fun d acc -> (ee d) @ acc) dtas [] in
|
|
let t_seg = List.fold_right (fun t acc -> (ee t) @ acc) txts [] in
|
|
{
|
|
entry = entry;
|
|
text_pos = mem_bot;
|
|
data_pos = mem_bot +. txts_size;
|
|
text_seg = t_seg;
|
|
data_seg = d_seg;
|
|
}
|
|
;;
|
|
|
|
(* Convert an object file into an executable machine state.
|
|
- allocate the mem array
|
|
- set up the memory state by writing the symbolic bytes to the
|
|
appropriate locations
|
|
- create the inital register state
|
|
- initialize rip to the entry point address
|
|
- initializes rsp to the last word in memory
|
|
- the other registers are initialized to 0
|
|
- the condition code flags start as 'false'
|
|
|
|
Hint: The Array.make, Array.blit, and Array.of_list library functions
|
|
may be of use.
|
|
*)
|
|
let load { entry; text_pos; data_pos; text_seg; data_seg } : mach =
|
|
failwith "load not implemented"
|
|
;;
|