The 3c compiler was my final year Bournemouth University project. It was graded 83%.
In the past, implementing virtual machines has either been a custom process or an endeavour into interfacing an existing virtual machine using (relatively) low level programming languages like C. Recently there has been a boom in high level scripting languages, which claim to make a programmer more productive, yet the field of compiler design is still rooted firmly with low level languages. It remains to be seen why language processors are not implemented in high level scripting languages.
The following report presents an investigation into designing and implementing com puter languages using a modern compiler construction tool-kit called the “Low Level Virtual Machine” (LLVM), in combination with a modern scripting language. The report covers in detail traditional approaches to compiler construction, parsing and virtual machine theory. Comparisons are made between traditional approaches and the modern approach offered by LLVM, via an experimental object oriented language called 3c, developed using LLVM, the Aperiot parser, Python and an incremental de velopment methodology.
Development happened in phases. I branched regularly (using svn), in the case of accidental breakage.
I am pleased to announce, that I have 3c doing full proper recursion!
Consider this 3c program:
#!/usr/bin/env 3c
# Fibonacci number generator
# $Id: fib2.3c 211 2009-05-10 07:20:46Z edd $
func fib(n)
let result = 0
if n <= 1
let result = n
else
let n1 = n - 1
let n2 = n - 2
let result = call fib(n1) + call fib(n2)
if_done
ret result
func_done
let loop = 0
let out = ""
while loop < 10
let out = out + call fib(loop) + " "
let loop = loop + 1
while_done
print "Your Fibonacci numbers:"
print out
When run you get this:
x31% 3c fib2.3c Your Fibonacci numbers: 0 1 1 2 3 5 8 13 21 34
Grand!
In phase 10b, operator overloading was completed using a set of “polyop tables” (one for each of [ +, -, *, / ]). This offers a dynamic way for variables of different types to be added (for example). Once user classes are implemented, the compiler is aware that any method named in a certain way defines an operator overloading rule. For example a method named “String::oper+Integer” defines how a integer can be “added” to a string.
Observe the following 3c code:
#!./3c # example 3c program showing builtin operator overloading # it's all too simple to add primitives of the same type let a = 34 + 6 call a->inspect() # but how about this... let b = "My age is: " + 24 call b->inspect() # or even... let c = 2 + 4 + "bleb" + " flob>" + (3 + 2) + "<hehe" call c->inspect()
When run you get this:
<Integer: value = 40> <String: value = "My age is: 24" length="13"> <String: value = "24bleb flob>5<hehe" length="18">
Notice how:
*NOTE*: This example was run before object heirachy was implemented. The process differs slightly now.
Consider this 3c program:
#!./3c
# turn on debug
# spits out loads of parser info and
# llvm byte code befire jitted
trace_all
# define and assign a variable
let a = 1
# print a variable
print a
# define a function
def my_func(z)
# show that arguments work
print z
# assign a local 'a'
let a = 5
print a
# you can return values
ret 55
done
# capture return value of a variable
let ret_val = call my_func(2)
print ret_val
# prove 'a' in my_func is local to it
print a
# some more complex syntax
print ((2 + ret_val) + 2 - 3) / 2 + call my_func(3)
When run we see the following output
*trace: int: i32 1
*trace: var_assign: Identifier('a'), i32 1
*trace: var_get: Identifier('a')
*trace: int_print: %0 = load i32* %a ; <i32> [#uses=0]
*trace: arg_push: Identifier('z')
*trace: func_def: Identifier('my_func')
*trace: var_get: Identifier('z')
*trace: int_print: %0 = load i32* %z ; <i32> [#uses=0]
*trace: int: i32 5
*trace: var_assign: Identifier('a'), i32 5
*trace: var_get: Identifier('a')
*trace: int_print: %2 = load i32* %a ; <i32> [#uses=0]
*trace: int: i32 55
*trace: ret: i32 55
*trace: end_block
*trace: int: i32 2
*trace: arg_push: i32 2
*trace: func_call: Identifier('my_func')
*trace: var_assign: Identifier('ret_val'), %2 = call i32 @my_func(i32 2) ; <i32> [#uses=0]
*trace: var_get: Identifier('ret_val')
*trace: int_print: %3 = load i32* %ret_val ; <i32> [#uses=0]
*trace: var_get: Identifier('a')
*trace: int_print: %5 = load i32* %a ; <i32> [#uses=0]
*trace: int: i32 2
*trace: var_get: Identifier('ret_val')
*trace: int_add: i32 2, %7 = load i32* %ret_val ; <i32> [#uses=0]
*trace: int: i32 2
*trace: int: i32 3
*trace: int_sub: i32 2, i32 3
*trace: int_add: %8 = add i32 2, %7 ; <i32> [#uses=0]
, i32 -1
*trace: int: i32 2
*trace: int_div: %9 = add i32 %8, -1 ; <i32> [#uses=0]
, i32 2
*trace: int: i32 3
*trace: arg_push: i32 3
*trace: func_call: Identifier('my_func')
*trace: int_add: %10 = sdiv i32 %9, 2 ; <i32> [#uses=0]
, %11 = call i32 @my_func(i32 3) ; <i32> [#uses=0]
*trace: int_print: %12 = add i32 %10, %11 ; <i32> [#uses=0]
ir dump:
===========
; ModuleID = '3c'
@__printf_fmt_i32 = constant [4 x i8] c"%d\0A\00" ; <[4 x i8]*> [#uses=1]
declare i32 @printf(i8*, ...)
define i32 @main() {
entry:
%a = alloca i32 ; <i32*> [#uses=3]
store i32 1, i32* %a
%0 = load i32* %a ; <i32> [#uses=1]
%1 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %0) ; <i32> [#uses=0]
%2 = call i32 @my_func(i32 2) ; <i32> [#uses=1]
%ret_val = alloca i32 ; <i32*> [#uses=3]
store i32 %2, i32* %ret_val
%3 = load i32* %ret_val ; <i32> [#uses=1]
%4 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %3) ; <i32> [#uses=0]
%5 = load i32* %a ; <i32> [#uses=1]
%6 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %5) ; <i32> [#uses=0]
%7 = load i32* %ret_val ; <i32> [#uses=1]
%8 = add i32 2, %7 ; <i32> [#uses=1]
%9 = add i32 %8, -1 ; <i32> [#uses=1]
%10 = sdiv i32 %9, 2 ; <i32> [#uses=1]
%11 = call i32 @my_func(i32 3) ; <i32> [#uses=1]
%12 = add i32 %10, %11 ; <i32> [#uses=1]
%13 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %12) ; <i32> [#uses=0]
ret i32 1
}
define i32 @my_func(i32 %__arg-z) {
entry:
%z = alloca i32 ; <i32*> [#uses=2]
store i32 %__arg-z, i32* %z
%0 = load i32* %z ; <i32> [#uses=1]
%1 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %0) ; <i32> [#uses=0]
%a = alloca i32 ; <i32*> [#uses=2]
store i32 5, i32* %a
%2 = load i32* %a ; <i32> [#uses=1]
%3 = call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @__printf_fmt_i32, i32 0, i32 0), i32 %2) ; <i32> [#uses=0]
ret i32 55
}
1
2
5
55
1
3
5
83
*trace: exit: <llvm.ee.GenericValue object at 0x8741176c>
What we see here is the parser analysing the source code of the 3c program and telling us what it is recognising as it does so (lines starting '*trace:'). As it does so, it is generating an LLVM module, which is a bytecode representation of the program. When the parser is done, the bytecode is printed (ir dump) and then the module is executed using JIT (just in time compilation).
Happy days :)