(* X86lite Simulator *) (* See the documentation in the X86lite specification, available on the course web pages, for a detailed explanation of the instruction semantics. *) open X86 (* simulator machine state -------------------------------------------------- *) let mem_bot = 0x400000L (* lowest valid address *) let mem_top = 0x410000L (* one past the last byte in memory *) let mem_size = Int64.to_int (Int64.sub mem_top mem_bot) let nregs = 17 (* including Rip *) let ins_size = 8L (* assume we have a 8-byte encoding *) let exit_addr = 0xfdeadL (* halt when m.regs(%rip) = exit_addr *) (* The simulator memory maps addresses to symbolic bytes. Symbolic bytes are either actual data indicated by the Byte constructor or 'symbolic instructions' that take up eight bytes for the purposes of layout. The symbolic bytes abstract away from the details of how instructions are represented in memory. Each instruction takes exactly eight consecutive bytes, where the first byte InsB0 stores the actual instruction, and the next sevent bytes are InsFrag elements, which aren't valid data. For example, the two-instruction sequence: at&t syntax ocaml syntax movq %rdi, (%rsp) Movq, [~%Rdi; Ind2 Rsp] decq %rdi Decq, [~%Rdi] is represented by the following elements of the mem array (starting at address 0x400000): 0x400000 : InsB0 (Movq, [~%Rdi; Ind2 Rsp]) 0x400001 : InsFrag 0x400002 : InsFrag 0x400003 : InsFrag 0x400004 : InsFrag 0x400005 : InsFrag 0x400006 : InsFrag 0x400007 : InsFrag 0x400008 : InsB0 (Decq, [~%Rdi]) 0x40000A : InsFrag 0x40000B : InsFrag 0x40000C : InsFrag 0x40000D : InsFrag 0x40000E : InsFrag 0x40000F : InsFrag 0x400010 : InsFrag *) type sbyte = | InsB0 of ins (* 1st byte of an instruction *) | InsFrag (* 2nd - 8th bytes of an instruction *) | Byte of char (* non-instruction byte *) (* memory maps addresses to symbolic bytes *) type mem = sbyte array (* Flags for condition codes *) type flags = { mutable fo : bool ; mutable fs : bool ; mutable fz : bool } (* Register files *) type regs = int64 array (* Complete machine state *) type mach = { flags : flags ; regs : regs ; mem : mem } (* simulator helper functions ----------------------------------------------- *) (* The index of a register in the regs array *) let rind : reg -> int = function | Rip -> 16 | Rax -> 0 | Rbx -> 1 | Rcx -> 2 | Rdx -> 3 | Rsi -> 4 | Rdi -> 5 | Rbp -> 6 | Rsp -> 7 | R08 -> 8 | R09 -> 9 | R10 -> 10 | R11 -> 11 | R12 -> 12 | R13 -> 13 | R14 -> 14 | R15 -> 15 ;; (* Helper functions for reading/writing sbytes *) (* Convert an int64 to its sbyte representation *) let sbytes_of_int64 (i : int64) : sbyte list = let open Char in let open Int64 in List.map (fun n -> Byte (shift_right i n |> logand 0xffL |> to_int |> chr)) [ 0; 8; 16; 24; 32; 40; 48; 56 ] ;; (* Convert an sbyte representation to an int64 *) let int64_of_sbytes (bs : sbyte list) : int64 = let open Char in let open Int64 in let f b i = match b with | Byte c -> logor (shift_left i 8) (c |> code |> of_int) | _ -> 0L in List.fold_right f bs 0L ;; (* Convert a string to its sbyte representation *) let sbytes_of_string (s : string) : sbyte list = let rec loop acc = function | i when i < 0 -> acc | i -> loop (Byte s.[i] :: acc) (pred i) in loop [ Byte '\x00' ] @@ (String.length s - 1) ;; (* Serialize an instruction to sbytes *) let sbytes_of_ins ((op, args) : ins) : sbyte list = let check = function | Imm (Lbl _) | Ind1 (Lbl _) | Ind3 (Lbl _, _) -> invalid_arg "sbytes_of_ins: tried to serialize a label!" | _ -> () in List.iter check args; [ InsB0 (op, args); InsFrag; InsFrag; InsFrag; InsFrag; InsFrag; InsFrag; InsFrag ] ;; (* Serialize a data element to sbytes *) let sbytes_of_data : data -> sbyte list = function | Quad (Lit i) -> sbytes_of_int64 i | Asciz s -> sbytes_of_string s | Quad (Lbl _) -> invalid_arg "sbytes_of_data: tried to serialize a label!" ;; (* It might be useful to toggle printing of intermediate states of your simulator. Our implementation uses this mutable flag to turn on/off printing. For instance, you might write something like: [if !debug_simulator then print_endline @@ string_of_ins u; ...] *) let debug_simulator = ref false (* override some useful operators *) let ( +. ) = Int64.add let ( -. ) = Int64.sub let ( *. ) = Int64.mul let ( <. ) a b = Int64.compare a b < 0 let ( >. ) a b = Int64.compare a b > 0 let ( <=. ) a b = Int64.compare a b <= 0 let ( >=. ) a b = Int64.compare a b >= 0 (* Interpret a condition code with respect to the given flags. *) (* !!! Check the Specification for Help *) let interp_cnd { fo; fs; fz } : cnd -> bool = function | Eq -> fz | Neq -> not fz | Gt -> fo = fs && not fz | Ge -> fo = fs | Lt -> fo <> fs | Le -> fo <> fs || fz (* Maps an X86lite address into Some OCaml array index, or None if the address is not within the legal address space. *) let map_addr (addr : quad) : int option = if addr >=. mem_bot && addr <. mem_top then Some (Int64.to_int (addr -. mem_bot)) else None (* Your simulator should raise this exception if it tries to read from or store to an address not within the valid address space. *) exception X86lite_segfault (* Raise X86lite_segfault when addr is invalid. *) let map_addr_segfault (addr : quad) : int = let idx = map_addr addr in match idx with | Some i -> i | None -> raise X86lite_segfault (* Simulates one step of the machine: - fetch the instruction at %rip - compute the source and/or destination information from the operands - simulate the instruction semantics - update the registers and/or memory appropriately - set the condition flags We provide the basic structure of step function and helper functions. Implement the subroutine below to complete the step function. See step function to understand each subroutine and how they are glued together. *) let readquad (m : mach) (addr : quad) : quad = int64_of_sbytes @@ Array.to_list @@ Array.sub m.mem (map_addr_segfault addr) 8 let writequad (m : mach) (addr : quad) (w : quad) : unit = Array.blit (Array.of_list @@ sbytes_of_int64 w) 0 m.mem (map_addr_segfault addr) 8 exception InvalidInstructinAlignment let fetchins (m : mach) (addr : quad) : ins = let ins_byte = m.mem.(map_addr_segfault addr) in match ins_byte with | InsB0 i -> i | _ -> raise InvalidInstructinAlignment exception MalformedOpcodeArgs (* Compute the instruction result. * NOTE: See int64_overflow.ml for the definition of the return type * Int64_overflow.t. *) let interp_opcode (m : mach) (o : opcode) (args : int64 list) : Int64_overflow.t = let open Int64 in let open Int64_overflow in match o, args with | Negq, [ n ] -> neg n | Incq, [ n ] -> succ n | Decq, [ n ] -> pred n | Addq, [n1; n2] -> add n1 n2 | Subq, [n1; n2] -> sub n2 n1 | Imulq, [n1; n2] -> mul n1 n2 | Notq, [ n ] -> ok @@ lognot n | Andq, [n1; n2] -> ok @@ logand n1 n2 | Orq, [n1; n2] -> ok @@ logor n1 n2 | Xorq, [n1; n2] -> ok @@ logxor n1 n2 | Sarq, [a; n1] -> a |> to_int |> shift_right n1 |> ok (* Passing the sign of the original value as the overflow is hacky, but it's the only way to communicate information about the original operand to the set_flags function *) | Shrq, [a; n1] -> a |> to_int |> shift_right_logical n1 |> withok (n1 <. 0L) (* Passing the sign of the original value as the overflow is hacky, but it's the only way to communicate information about the original operand to the set_flags function *) | Shlq, [a; n1] -> a |> to_int |> shift_left n1 |> withok (n1 <. 0L) | Set c, [ n1 ] -> if interp_cnd m.flags c then ok @@ logand n1 0xFFFFFFFFFFFFFF01L else ok @@ logand n1 0xFFFFFFFFFFFFFF00L | Leaq, [i; _] -> ok i | Movq, [s; _] -> ok s | Cmpq, [n1; n2] -> sub n2 n1 | Jmp, [i] -> ok i | J c, [d] -> if interp_cnd m.flags c then ok d else ok m.regs.(rind Rip) | _ -> raise MalformedOpcodeArgs ;; exception FoundLabelInAsm exception UnknownWriteBack (** Update machine state with instruction results. *) let ins_writeback (m : mach) : ins -> int64 -> unit = let write_to_dest q d = match d with | Reg r -> m.regs.(rind r) <- q | Ind1 (Lit a) -> writequad m a q | Ind2 r -> writequad m m.regs.(rind r) q | Ind3 (Lit dis, r) -> writequad m (m.regs.(rind r) +. dis) q | _ -> raise FoundLabelInAsm in fun i q -> match i with | Jmp, _ -> m.regs.(rind Rip) <- q | J _, _ -> m.regs.(rind Rip) <- q | Cmpq, _ -> () | _, [_; d] -> write_to_dest q d | _, [ d ] -> write_to_dest q d | _ -> raise UnknownWriteBack ;; exception NonIndirectOrLabel (* mem addr ---> mem array index *) let interp_operands (m : mach) : ins -> int64 list = let inter_value = function | Imm ( Lit q ) -> q | Reg r -> m.regs.(rind r) | Ind1 ( Lit q ) -> readquad m q | Ind2 r -> readquad m m.regs.(rind r) | Ind3 (Lit q, r) -> readquad m @@ m.regs.(rind r) +. q | _ -> raise FoundLabelInAsm in let inter_addr = function | Ind1 (Lit q) -> q | Ind2 r -> m.regs.(rind r) | Ind3 (Lit q, r) -> m.regs.(rind r) +. q | _ -> raise NonIndirectOrLabel in function | Leaq, [i; d] -> [ inter_addr i; inter_value d ] | _, ops -> List.map inter_value ops ;; let validate_operands : ins -> unit = function (* TODO: Add validations, leaq can only take indirect operands for example *) | _ -> () ;; let crack : ins -> ins list = function | Pushq, [ op ] -> [Subq, [Imm (Lit 8L); Reg Rsp]; Movq, [op; Ind2 Rsp]] | Popq, [ op ] -> [Movq, [Ind2 Rsp; op]; Addq, [Imm (Lit 8L); Reg Rsp]] (* TODO: These two have a more elegant implementation if we allow crack to be recursive *) | Callq, [ op ] -> [Subq, [Imm (Lit 8L); Reg Rsp]; Movq, [Reg Rip; Ind2 Rsp]; Jmp, [op]] | Retq, [] -> [Movq, [Ind2 Rsp; Reg Rip]; Addq, [Imm (Lit 8L); Reg Rsp]] | i -> [ i ] ;; (* TODO: double check against spec *) let set_flags (m : mach) (op : opcode) (ws : quad list) (w : Int64_overflow.t) : unit = let fz = w.value = 0L in let fs = w.value <. 0L in let fo = w.overflow in let set_zf_and_sf = function | Notq | Set _ | Leaq | Movq | Jmp | J _ | Callq | Retq | Pushq | Popq -> () | Sarq | Shlq | Shrq -> if 0L = (List.hd ws) then () else m.flags.fz <- fz; m.flags.fs <- fs (* Technically, fz and fs are not defined for Imulq, but it's fine to set them to these values *) | _ -> m.flags.fz <- fz; m.flags.fs <- fs in let set_of = function | Notq | Set _ | Leaq | Movq | Jmp | J _ | Callq | Retq | Pushq | Popq -> () | Andq | Orq | Xorq -> m.flags.fo <- false | Sarq -> if 1L = (List.hd ws) then m.flags.fo <- false else () (* fo and fs are calculated after the shift. When AMT == 1, comparing fs and fo is equivalent to comparing the 2 most significant bits of the original value *) | Shlq -> if 1L = (List.hd ws) then m.flags.fo <- (fs = fo) else () | Shrq -> if 1L = (List.hd ws) then m.flags.fo <- fo else () (* This covers Negq, Addq, Subq, Imulq, Incq, Decq and Cmpq *) | _ -> m.flags.fo <- fo in set_zf_and_sf op; set_of op ;; let step (m : mach) : unit = (* execute an instruction *) let ((op, args) as ins) = fetchins m m.regs.(rind Rip) in validate_operands ins; (* Some instructions involve running two or more basic instructions. * For other instructions, just return a list of one instruction. * See the X86lite specification for details. *) let uops : ins list = crack (op, args) in m.regs.(rind Rip) <- m.regs.(rind Rip) +. ins_size; List.iter (fun ((uop, _) as u) -> if !debug_simulator then print_endline @@ string_of_ins u; let ws = interp_operands m u in let res = interp_opcode m uop ws in ins_writeback m u @@ res.Int64_overflow.value; set_flags m op ws res) uops ;; (* Runs the machine until the rip register reaches a designated memory address. Returns the contents of %rax when the machine halts. *) let run (m : mach) : int64 = while m.regs.(rind Rip) <> exit_addr do step m done; m.regs.(rind Rax) ;; (* assembling and linking --------------------------------------------------- *) (* A representation of the executable *) type exec = { entry : quad (* address of the entry point *) ; text_pos : quad (* starting address of the code *) ; data_pos : quad (* starting address of the data *) ; text_seg : sbyte list (* contents of the text segment *) ; data_seg : sbyte list (* contents of the data segment *) } (* Assemble should raise this when a label is used but not defined *) exception Undefined_sym of lbl (* Assemble should raise this when a label is defined more than once *) exception Redefined_sym of lbl (* Convert an X86 program into an object file: - separate the text and data segments - compute the size of each segment Note: the size of an Asciz string section is (1 + the string length) due to the null terminator - resolve the labels to concrete addresses and 'patch' the instructions to replace Lbl values with the corresponding Imm values. HINT: consider building a mapping from symboli Lbl to memory address - the text segment starts at the lowest address - the data segment starts after the text segment HINT: List.fold_left and List.fold_right are your friends. *) exception UnexpectedData exception UnexpectedText let is_size (is : ins list) : quad = 8L *. (Int64.of_int @@ List.length is) ;; let elem_is_size (e : elem) : quad = match e with | { asm = Text il; _ } -> is_size il | _ -> raise UnexpectedData ;; let ds_size (ds : data list) : quad = let d_size d acc = match d with | Asciz s -> acc +. 1L +. (Int64.of_int @@ String.length s) | Quad _ -> acc +. 8L in List.fold_right d_size ds 0L ;; let elem_ds_size (e : elem) : quad = match e with | { asm = Data dl; _ } -> ds_size dl | _ -> raise UnexpectedText ;; let elem_size (e : elem) : quad = match e with | { asm = Data dl; _ } -> ds_size dl | { asm = Text il; _ } -> is_size il ;; type symbol_table = (lbl * quad) list let build_symbol_table (p : prog) : symbol_table = let accum_elem ((addr, tbl) as acc) ({ lbl = l; _ } as e) = let curr = List.assoc_opt l tbl in let new_addr = addr +. (elem_size e) in let new_sym_tbl = (l, addr) :: tbl in match curr with | Some _ -> raise (Redefined_sym l) | None -> (new_addr, new_sym_tbl) in let (_, sym_tbl) = List.fold_left accum_elem (mem_bot, []) p in sym_tbl ;; let patch_imm (i : imm) (t : symbol_table) : imm = match i with | Lbl l -> let addr_opt = List.assoc_opt l t in (match addr_opt with | Some addr -> Lit addr | None -> raise (Undefined_sym l)) | _ -> i ;; let patch_data (t : symbol_table) (d : data) : data = match d with | Quad i -> Quad (patch_imm i t) | _ -> d ;; let patch_ins (t : symbol_table) ((op, operands) as i : ins) : ins = let patch_operand o = match o with | Imm im -> Imm (patch_imm im t) | Ind1 im -> Ind1 (patch_imm im t) | Ind3 (im, r) -> Ind3 (patch_imm im t, r) | _ -> o in let new_operands = List.map patch_operand operands in (op, new_operands) ;; let emit_elem (t :symbol_table) ({ asm = a; _ } : elem) : sbyte list = match a with | Text il -> List.fold_right (fun ins acc -> (sbytes_of_ins ins) @ acc) (List.map (patch_ins t) il) [] | Data dl -> List.fold_right (fun d acc -> (sbytes_of_data d) @ acc) (List.map (patch_data t) dl) [] ;; let assemble (p : prog) : exec = let txts = let append_if_text e acc = match e with | { asm = Text _; _ } -> e :: acc | _ -> acc in List.fold_right append_if_text p [] in let txts_size = List.fold_left (fun acc e -> acc +. (elem_is_size e)) 0L txts in let dtas = let append_if_data e acc = match e with | { asm = Data _; _ } -> e :: acc | _ -> acc in List.fold_right append_if_data p [] in let dtas_size = List.fold_left (fun acc e -> acc +. (elem_ds_size e)) 0L dtas in let ordered_prog = txts @ dtas in let sym_tbl = build_symbol_table ordered_prog in let entry = match (List.assoc_opt "main" sym_tbl) with | Some q -> q | None -> raise (Undefined_sym "main") in let ee = emit_elem sym_tbl in let d_seg = List.fold_right (fun d acc -> (ee d) @ acc) dtas [] in let t_seg = List.fold_right (fun t acc -> (ee t) @ acc) txts [] in { entry = entry; text_pos = mem_bot; data_pos = mem_bot +. txts_size; text_seg = t_seg; data_seg = d_seg; } ;; (* Convert an object file into an executable machine state. - allocate the mem array - set up the memory state by writing the symbolic bytes to the appropriate locations - create the inital register state - initialize rip to the entry point address - initializes rsp to the last word in memory - the other registers are initialized to 0 - the condition code flags start as 'false' Hint: The Array.make, Array.blit, and Array.of_list library functions may be of use. *) let load { entry; text_pos; data_pos; text_seg; data_seg } : mach = let mem = Array.make mem_size InsFrag in let regs = Array.make 17 0L in let flags = {fs = false; fz = false; fo = false} in let sentinel_addr = mem_top -. 8L in let sentinel_pos = mem_size - 8 in let text_seg_size = List.length text_seg in let data_seg_size = List.length data_seg in regs.(rind Rip) <- entry; regs.(rind Rsp) <- sentinel_addr; Array.blit (Array.of_list @@ sbytes_of_int64 exit_addr) 0 mem sentinel_pos 8; Array.blit (Array.of_list text_seg) 0 mem 0 text_seg_size; Array.blit (Array.of_list data_seg) 0 mem text_seg_size data_seg_size; { flags = flags; regs = regs; mem = mem; } ;;