Додатки¶
[B1] | Пустоваров В.И. Ассемблер: программирование и анализ корректности машинных программ. – К: BHV, 2000, Стор. 230-265. |
[B2] | Dandamudi, S.P.: Guide to Assembly Language Programming in Linux - Springer, 2005, Стор. 552. |
[B3] | Tanenbaum, A.S. and Woodhull, A.S.: Operating systems: design and implementation - Pearson Prentice Hall, 2006, 1054 c. |
[B4] | Mark Lutz: Learning Python, 2009, 1214 c. |
[B5] | Marty Alchin: Pro Python, 2010, 368 c. |
[B6] | Бек Л.: Введение в системное программирование: Пер. с англ.- М.: Мир, 1988. - 448 с. |
[B7] | Інтернет-ресурс Intel x86 Instruction Reference - http://www.posix.nl/linuxassembly/nasmdochtml/nasmdoca.html |
[B8] | Інтернет-ресурс Running NASM - http://www.nasm.us/doc/nasmdoc2.html |
Висновок¶
Було розроблено мову програмування високого рівня, повоної за Тюрингом, до неї розроблено компілятор з функцією оптимізації. Мова описана в формальній граматиці.
Під час написання роботи було написано лексичний, синтаксичний аналізатори, вивчені та реалізовані оптимізаційні алгоритми, робота кінцевого автомату станів, реалізований механізм обробки помилок. Вивчений синтаксис асемблеру NASM, тонкощі компіляції програм для операційної системи GNU/Linux.
Розроблена програма може бути використана як повноцінний компілятор, або бути частиної інтерпретатора з функцію Just-in-Time компіляції, які зараз достатньо поширені. Така можливість істотньо покращить показники швидкості запуску програм інтерпретованих мов програмування.
Код програми¶
./pyc
#!/usr/bin/python2
from __future__ import print_function
import sys
import string
import os
import argparse
import subprocess
from utils.lexer import lex
from utils.syntax import synt, print_tree
from utils.gen import find_vars, gen_code
from utils.gen_asm import gen_real_asm
from utils.optimizer import optimize
from utils import ParserError, verbose_output
VERSION = (1, 1)
def print_header():
print("pyCompiler %d.%d" % VERSION)
print('-------------------')
def build_file_path(fl_name, ext, build_dir=True):
fl = dirname = os.path.basename(fl_name)
if build_dir:
dirname = os.path.join(os.path.dirname(fl_name), 'build')
else:
dirname = os.path.dirname(fl_name)
parts = fl.split('.')
parts[-1] = ext
return os.path.join(dirname, string.join(parts, '.'))
@verbose_output
def do_lex(args):
lex_file = build_file_path(args.file, 'lex')
print("Lexical analysis: ", end="")
lex_l = lex(file(args.file).read())
if args.lex:
lexf = open(lex_file, 'w')
print(lex_l, file=lexf)
lexf.close()
print("Done")
return lex_l
@verbose_output
def do_synt(args, lex_l):
tree_file = build_file_path(args.file, 'synt')
print("Syntax analysis: ", end="")
tree = synt(lex_l)
if args.synt:
tree_f = open(tree_file, 'w')
print(tree, file=tree_f)
print_tree(tree, f=tree_f)
tree_f.close()
print("Done")
return tree
@verbose_output
def do_stats(args, tree):
stats_file = build_file_path(args.file, 'stat')
print("Find variables and strings: ", end="")
stat = find_vars(tree)
if args.synt:
stats_f = open(stats_file, 'w')
print("vars =", stat.vars, file=stats_f)
print("strs =", stat.strs, file=stats_f)
stats_f.close()
print("Done")
return stat
@verbose_output
def do_gen_pseudo_asm(args, tree, stat):
print("Generate Pseudo-Asm code: ", end="")
p = gen_code(tree, stat)
print("Done")
return p
@verbose_output
def do_gen_asm(args, p):
fl = os.path.basename(args.file)
asmfile_name = build_file_path(args.file, 'asm')
print("Generate NASM code: ", end="")
asmfile = open(asmfile_name, 'w')
lines = gen_real_asm(p, os.path.basename(fl))
for line in lines:
print(line, file=asmfile)
asmfile.close()
print("Done")
return p
@verbose_output
def do_optimization(args, p):
print("Optimization: ", end="")
res = optimize(p)
print("Done")
return res
@verbose_output
def do_compile(args):
lstfile_name = build_file_path(args.file, 'lst')
asmfile_name = build_file_path(args.file, 'asm')
ofile_name = build_file_path(args.file, 'o')
print("Compiling: ", end="")
params = ["nasm", "-f", "elf", "-o", ofile_name, '-O2' if not args.no_optimize else '',
'-l' if args.lst else '', lstfile_name if args.lst else '', asmfile_name]
try:
if args.verbose:
print('\n', ' '.join(params))
subprocess.check_output(params)
except subprocess.CalledProcessError:
print("NASM Error!", file=sys.stderr)
# print(res)
sys.exit(-1)
except OSError:
print("NASM not found", file=sys.stderr)
sys.exit(-1)
if not args.asm:
os.remove(asmfile_name)
print("Done")
@verbose_output
def do_link(args):
ofile_name = build_file_path(args.file, 'o')
binfile_name = build_file_path(args.file, 'bin', build_dir=False)
print("Linking: ", end="")
params = ["ld", '-s', "-lc", '-dynamic-linker', '/lib/ld-linux.so.2', '-o', binfile_name, ofile_name]
try:
print('\n', ' '.join(params))
res = subprocess.check_output(params)
except subprocess.CalledProcessError:
print("ld Error!", file=sys.stderr)
print(res)
sys.exit(-1)
except OSError:
print("ld not found", file=sys.stderr)
sys.exit(-1)
if not args.asm:
os.remove(ofile_name)
print("Done")
def main():
print_header()
parser = argparse.ArgumentParser()
parser.add_argument('file', metavar="FILE", type=str, help='source file name')
parser.add_argument('-l', '--lex', action="store_true", help="save lexical processing results to file")
parser.add_argument('-s', '--synt', action="store_true", help="save syntax tree to file")
parser.add_argument('-v', '--verbose', action="store_true", help="print information")
parser.add_argument('-A', '--asm', action="store_true", help="save asm file")
parser.add_argument('-L', '--lst', action="store_true", help="listing")
parser.add_argument('-O0', action="store_true", help="no internal optimization")
parser.add_argument('-n', '--no-optimize', action="store_true", help="no -O2 option for nasm")
parser.add_argument('-a', '--all-intermediate', action="store_true", help="Synonym for -slAL")
parser.add_argument('--target', type=str, choices=["linux", "c2m"], default="linux",
help="c2m target is Cyclone II based CPU")
args = parser.parse_args()
if args.all_intermediate:
args.lex = args.asm = args.synt = args.lst = True
build_files = args.lex or args.asm or args.synt or args.lst
build_dir_name = os.path.join(os.path.dirname(args.file), 'build')
if not os.path.exists(build_dir_name):
os.mkdir(build_dir_name)
try:
tree = do_synt(args, do_lex(args))
p = do_gen_pseudo_asm(args, tree, do_stats(args, tree))
if not args.O0:
p = do_optimization(args, p)
do_gen_asm(args, p)
do_compile(args)
do_link(args)
except IOError:
print("\n\nERROR: File not found", file=sys.stderr)
sys.exit(-1)
except ParserError, e:
print('\n\nERROR: %s' % e.message, file=sys.stderr)
if args.verbose:
import traceback
traceback.print_exc()
sys.exit(-2)
except NotImplementedError, e:
print("\n\nNot implemented: %s" % e.message, file=sys.stderr)
sys.exit(-3)
if not build_files:
try:
os.rmdir(build_dir_name)
except OSError:
pass
if __name__ == '__main__':
main()
./utils/__init__.py
class ParserError(Exception):
pass
import os
import sys
from const import *
def typeof(t):
if t is None:
return T_NO
if isinstance(t, FunctionCallInfo):
return T_CALL
if not isinstance(t, str):
return T_NO
if t.isalpha():
if t in RESERVED_WORDS:
return RESERVED_WORDS[t]
else:
return T_VAR
elif t.isdigit():
return T_NUMBER
elif t in SYMB_DICT:
return SYMB_DICT[t]
elif t[0] == '"' and t[-1] == '"':
return T_STRING
elif t.isalnum():
return T_VAR
else:
return T_NO
class FunctionCallInfo(str):
def __new__(cls, name, args):
s = super(FunctionCallInfo, cls).__new__(cls, name)
s.args = args
return s
def verbose_output(func):
"""
Suppresses output if --verbose was not set
"""
def _verbose_output(*pargs, **kwargs):
args = pargs[0]
old_stdout = sys.stdout
if not args.verbose:
null_output = open(os.devnull, 'w')
sys.stdout = null_output
try:
ret = func(*pargs, **kwargs)
if not args.verbose:
sys.stdout = old_stdout
except:
#fallback
if not args.verbose:
sys.stdout = old_stdout
raise
return ret
return _verbose_output
./utils/const.py
# Token types
T_NO, T_IF, T_PRINT, T_READ, T_VAR, T_NUMBER, T_STRING, T_OPEREND, T_CTRLEND, \
T_EQ, T_PLUS, T_MINUS, T_IMUL, T_IDIV, T_POPEN, T_PCLOSE, T_BEGIN, T_END, \
T_LT, T_GT, T_GE, T_LE, T_ELSE, T_ENDIF, T_WHILE, T_ENDWHILE, T_MOD,\
T_FUNCTION, T_SEPARATOR, T_RETURN, T_ENDFUNC, T_CALL = range(32)
# Tree blocks
A_NO, A_ASSIGN, A_IF, A_BLOCK, A_PRINT, A_ELSE, A_READ, A_WHILE, A_FUNCTION, \
A_RETURN, A_CALL = range(11)
# Asm command type
# C_EQU_F is $-label
C_NO, C_ADD, C_SUB, C_PUSH, C_POP, C_CALL, C_PRINT, C_COMMENT, C_READ, C_MOV, \
C_CMP, C_DB, C_DD, C_EQU, C_EQU_F, C_EXTRN, C_GLOBL, C_LABEL, C_INT, C_JMP, \
C_IMUL, C_IDIV, C_RET, C_NEG = range(24)
C_OPT_NO, C_OPT_ADDR, C_PRINT_STR, C_PRINT_VAR = range(4)
EXPRESSIONS_TOKENS = [T_VAR, T_NUMBER, T_STRING, T_EQ,
T_PLUS, T_MINUS, T_IMUL, T_IDIV, T_MOD,
T_LT, T_GT, T_GE, T_LE, T_POPEN, T_PCLOSE,
T_SEPARATOR, ]
NAMES = \
{
A_NO: "<no>",
A_ASSIGN: "=",
A_IF: "if",
A_BLOCK: "{block}",
A_PRINT: "print",
A_READ: "read",
A_WHILE: "while",
A_FUNCTION: "function",
A_RETURN: "return",
A_CALL: "call",
}
# line start states
START_LIST = [] # gen
RANGES_LIST = (T_BEGIN, T_END)
EXPRESSIONS_STATES = None # gen
START_NODE = -1
links = \
{ # gen
}
SYMB_DICT = \
{
"=": T_EQ,
"+": T_PLUS,
"-": T_MINUS,
"*": T_IMUL,
"/": T_IDIV,
"%": T_MOD,
";": T_OPEREND,
":": T_CTRLEND,
">": T_GT,
"<": T_LT,
">=": T_GE,
"<=": T_LE,
'(': T_POPEN,
')': T_PCLOSE,
'{': T_BEGIN,
'}': T_END,
',': T_SEPARATOR,
}
RESERVED_WORDS = \
{
'if': T_IF,
'print': T_PRINT,
'read': T_READ,
'else': T_ELSE,
'endif': T_ENDIF,
'while': T_WHILE,
'endwhile': T_ENDWHILE,
'function': T_FUNCTION,
'return': T_RETURN,
'endfunc': T_ENDFUNC,
}
from graph import read_syntax_graph
import sys
read_syntax_graph(sys.modules[__name__])
./utils/lexer.py
import string
from utils import ParserError
class Token(str):
def __new__(cls, val, line):
s = super(Token, cls).__new__(cls, val)
s.line = line
return s
def lex(text):
global line_num
NO, ALPHA, NUM, SYMB, CMDEND, QUOTE, QUOTE_end, COMMENT = range(8)
line_num = 1
def typeof(s, string=False):
global line_num
if s[0].isalpha():
return ALPHA
elif s[0].isdigit():
return NUM
elif s[0] in "=:><+-*/(){}%,":
return SYMB
elif s[0] in [';']:
return CMDEND
elif s[0] in ['"', '\'']:
return QUOTE
elif s[0] in ['#']:
return COMMENT
elif s.strip() == "":
return NO
else:
if not string:
raise ParserError(
"Unknown symbol %s on line %d" % (s, line_num))
def symb_check(t, s):
COMBINATIONS = ('>=', '<=', '**', )
return (string.strip(string.join(t, '')) + s) in COMBINATIONS
all_tokens = []
token = []
prev_type = current_type = NO
string_token = ""
inline_comment = False
for s in text:
if s == '\n':
if prev_type == QUOTE:
raise ParserError("Unclosed quotes on line %d" % line_num)
line_num += 1
if inline_comment:
if s == '\n':
inline_comment = False
continue
current_type = typeof(s, prev_type == QUOTE)
if prev_type == QUOTE:
if current_type != QUOTE:
string_token += s
else:
all_tokens.append(Token("\"%s\"" % string_token, line_num))
string_token = ""
prev_type = QUOTE_end
token = []
continue
if current_type == COMMENT:
inline_comment = True
continue
if ((prev_type == current_type) or (prev_type == ALPHA and current_type == NUM)) \
and (current_type != SYMB or symb_check(token, s)):
token.append(s)
else:
if prev_type != NO:
clear_token = string.strip(string.join(token, ''))
if len(clear_token) > 0:
all_tokens.append(Token(clear_token, line_num))
prev_type = current_type
token = [s]
clear_token = string.strip(string.join(token, ''))
if len(clear_token) > 0:
all_tokens.append(clear_token)
return all_tokens
./utils/syntax.py
#-*- coding: utf8 -*-
" Syntax analyser "
from __future__ import print_function
import sys
from utils import ParserError
from const import *
from . import typeof, FunctionCallInfo
MACHINE_DEFAULT, MACHINE_EXPR = range(2)
machine = []
DEBUG = False
def synt(lex):
" Builds syntax tree "
global global_lex, global_stack, gres
global_lex = []
global_stack = []
gres = []
def_machine = m_default()
def_machine.next()
# INIT
machine.append(def_machine)
global_lex = list(reversed(lex))
while len(global_lex) > 0:
token = global_lex.pop()
machine[-1].send(token)
def_machine.send('}')
return gres
global_stack = []
gres = []
global_lex = []
def m_expressions():
" Machine for expression analysis "
global global_lex
stack = []
res = []
weights = {
T_POPEN: 0,
T_CALL: 0,
T_PCLOSE: 1,
T_SEPARATOR: 2,
T_PLUS: 20,
T_MINUS: 20,
T_IMUL: 10,
T_IDIV: 10,
T_MOD: 10,
T_GT: 5,
T_LT: 5,
T_GE: 5,
T_LE: 5,
T_EQ: 5,
T_OPEREND: -9000,
T_CTRLEND: -9000,
}
current_line = -1
while True:
token = (yield)
if token == None:
continue
if hasattr(token, 'line'):
current_line = token.line
token_type = typeof(token)
if token_type not in [T_OPEREND, T_CTRLEND] + EXPRESSIONS_TOKENS:
raise ParserError('Syntax error on line %d' % current_line)
if token_type in [T_VAR, T_NUMBER]:
res.append(token)
# FIXME: replace by logging
if DEBUG:
print (stack, res)
continue
if token_type in [T_POPEN, ]: # parentheses or function call
if (len(res) > 0) and typeof(res[-1]) == T_VAR:
stack.append(FunctionCallInfo(res.pop(), len(res)))
else:
stack.append(token)
if DEBUG:
print (stack, res)
continue
while (len(stack) != 0) and \
(weights[token_type] <= weights[typeof(stack[-1])]):
operation = (stack[-1], res[-2:])
del res[-2:]
stack.pop()
res.append(operation)
if token_type in [T_PCLOSE, ]:
oper = stack.pop()
if typeof(oper) == T_CALL:
assert isinstance(oper, FunctionCallInfo)
# function
args_count = len(res) - oper.args
if DEBUG:
print (oper, args_count, res[-args_count:])
if args_count > 0:
args = res[-args_count:]
del res[-args_count:]
else:
args = []
res.append((A_CALL, oper, args))
if DEBUG:
print (stack, res)
continue
if len(stack) == 0 or (weights[token_type] > weights[typeof(stack[-1])]):
if token_type != T_SEPARATOR:
stack.append(token)
if DEBUG:
print (stack, res)
if token_type not in EXPRESSIONS_TOKENS:
stack.pop()
assert len(stack) == 0
machine.pop()
global_stack.append(res)
global_lex.append(token)
class FunctionDescription(object):
def __init__(self):
self.name = None
self.args = []
self.inner_vars = []
def __repr__(self):
return "%s%s" % (self.name, self.args)
class Machine(object):
def __init__(self):
self.func = None
self.token = None
def extract_block(self, stop):
group = []
while True:
if not len(gres) > 0:
raise ParserError("Syntax error: invalid block")
last = gres.pop()
if last != stop:
group.append(last)
else:
break
gres.append((A_BLOCK, group))
def do_function(self):
self.func = FunctionDescription()
def do_func_name(self):
self.func.name = self.token
def do_func_arg(self):
self.func.args.append(self.token)
def do_func_args_end(self):
gres.append(A_FUNCTION)
def do_beg(self):
gres.append(A_BLOCK)
def do_end(self):
group = []
self.extract_block(A_BLOCK)
if len(group) > 0:
gres.append((A_BLOCK, group))
def do_var(self):
gres.append(self.token)
def do_string(self):
gres.append(self.token)
def do_ifsend(self):
gres.append(A_IF)
def do_elsesend(self):
self.extract_block(A_IF) # THEN-block
gres.append(A_ELSE)
def do_endifsend(self):
self.extract_block(A_ELSE)
elseblock = gres.pop()
thenblock = gres.pop()
oper = (A_IF, [global_stack.pop(), thenblock, elseblock])
# print (op)
gres.append(oper)
def do_endfuncsend(self):
self.extract_block(A_FUNCTION)
block = gres.pop()
op = (A_FUNCTION, [self.func, block])
gres.append(op)
def do_assignsend(self):
oper = (A_ASSIGN, [gres.pop(), global_stack.pop()])
gres.append(oper)
def do_returnsend(self):
oper = (A_RETURN, gres.pop())
gres.append(oper)
def do_printsend(self):
oper = (A_PRINT, gres.pop())
gres.append(oper)
def do_readsend(self):
oper = (A_READ, gres.pop())
gres.append(oper)
def do_whilesend(self):
gres.append(A_WHILE)
def do_endwhilesend(self):
self.extract_block(A_WHILE)
block = gres.pop()
op = (A_WHILE, [global_stack.pop(), block])
gres.append(op)
def m_default():
ptype = START_NODE
waitfor = links[ptype][0]
def istypeeq(token_type, state_type):
if state_type == None:
return True
else:
return token_type in links[state_type][1]
stack = []
gres.append(A_BLOCK)
current_line = -1
# func = None
state_executor = Machine()
while True:
token = (yield)
if hasattr(token, 'line'):
current_line = token.line
if ptype in EXPRESSIONS_STATES: # processed in other machine, so waiting for ';' or ':'
waitfor = links[ptype][0]
ctype = typeof(token)
# check syntax errors
possibles = reduce(lambda a, b: a + b, [[] if links[x][1] == None else list(links[x][1])
for x in waitfor])
if possibles is None:
possibles = []
else:
possibles = list(possibles)
possibles += RANGES_LIST
if possibles is not None and ctype not in possibles:
raise ParserError('Syntax error on line %d' % current_line)
if ctype == T_NO and token != None:
# FIXME: dead code?
raise ParserError(
"Unknown token '%s' on line %d" % (token, current_line))
# Next state
possibles = []
if len(waitfor) == 1: # single transition
possibles.append(waitfor[0])
else:
for possible in waitfor:
if istypeeq(typeof(token), possible):
possibles.append(possible)
if len(possibles) > 1:
raise ParserError("Syntax error: Ambiguity")
if len(possibles) == 0:
raise ParserError("Syntax error")
ptype = possibles[0]
# Process state
if links[ptype][2] is not None:
action = getattr(state_executor, "do_%s" % links[ptype][2])
state_executor.token = token
action()
waitfor = links[ptype][0]
# check instant states
if links[waitfor[0]][3]:
ptype = links[ptype][0][0]
waitfor = links[ptype][0]
if len(waitfor) == 1 and waitfor[0] in EXPRESSIONS_STATES:
m = m_expressions()
m.next()
machine.append(m)
stack.append(ctype)
ptype = waitfor[0]
def print_tree(t, n=0, f=sys.stdout):
if isinstance(t, list):
for x in reversed(t):
print_tree(x, n, f=f)
elif isinstance(t, tuple):
if t[0] in NAMES:
print("--" * n, NAMES[t[0]], file=f)
else:
print("--" * n, t[0], file=f)
print_tree(t[1], n=n + 1, f=f)
else:
print("--" * n, t, file=f)
./utils/gen.py
from __future__ import print_function
from functools import partial
from utils.const import *
from utils import ParserError, typeof
class TreeStats(object):
def __init__(self, vars=None, strs=None, funcs=None):
if vars is None:
vars = []
if strs is None:
strs = []
if funcs is None:
funcs = {}
self.vars = vars
self.strs = strs
self.funcs = funcs
self.use_print = False
self.use_read = False
def find_vars(t, stat=None, func=None):
if func is not None:
prefix = func.name + "_"
else:
prefix = ""
if stat == None:
stat = TreeStats()
if isinstance(t, list):
for x in reversed(t):
find_vars(x, stat=stat, func=func)
elif isinstance(t, tuple):
if t[0] == A_PRINT:
stat.use_print = True
elif t[0] == A_READ:
stat.use_read = True
if t[0] == A_FUNCTION:
stat.funcs[t[1][0].name] = {'args': t[1][0].args,
'args_count': len(t[1][0].args),
'info': t[1][0]}
for v in t[1][0].args:
var = "%s_%s" % (t[1][0].name, v)
stat.vars.append(var)
t[1][0].inner_vars.append(var)
find_vars(t[1][1], stat=stat, func=t[1][0])
else:
find_vars(t[1], stat=stat, func=func)
else:
if isinstance(t, str) and typeof(t) == T_VAR:
if prefix + t not in stat.vars:
stat.vars.append(prefix + t)
if func is not None:
func.inner_vars.append(prefix + t)
if isinstance(t, str) and typeof(t) == T_STRING:
if t not in stat.strs:
stat.strs.append(t)
return stat
def make_asm_node_p(p, cmd, o, v, shift=0):
p.text.append((cmd, o, v, shift,))
class PseudoAsm(object):
def __init__(self, p=None):
if p is None:
self.text = []
self.data = []
self.labelNum = 0
self.ifNum = 0
self.loopNum = 0
self.funcNum = 0
else:
self.text = p.text[:]
self.data = p.data[:]
self.labelNum = p.labelNum
self.ifNum = p.ifNum
self.loopNum = p.loopNum
self.funcNum = p.funcNum
def gen_text_section(t, stat, p=None, prefix=""):
if p is None:
p = PseudoAsm()
make_asm_node = partial(
make_asm_node_p, p=p, shift=(0 if len(prefix) == 0 else 1))
aa_push = partial(make_asm_node, cmd=C_PUSH)
aa_pop = partial(make_asm_node, cmd=C_POP, o=None)
aa_mov = partial(make_asm_node, cmd=C_MOV)
aa_call = partial(make_asm_node, cmd=C_CALL, o=None)
aa_add = partial(make_asm_node, cmd=C_ADD)
aa_sub = partial(make_asm_node, cmd=C_SUB)
aa_imul = partial(make_asm_node, cmd=C_IMUL, o=None)
aa_idiv = partial(make_asm_node, cmd=C_IDIV, o=None)
aa_cmp = partial(make_asm_node, cmd=C_CMP, o=None)
aa_jmp = partial(make_asm_node, cmd=C_JMP)
aa_label = partial(make_asm_node, cmd=C_LABEL, o=None)
aa_neg = partial(make_asm_node, cmd=C_NEG, o=None)
aa_push_num = partial(aa_push, o=C_OPT_NO)
aa_push_addr = partial(aa_push, o=C_OPT_ADDR)
aa_ret = partial(make_asm_node, cmd=C_RET, o=None, v=None)
iterate = t
if isinstance(t, list):
iterate = reversed(t)
if isinstance(t, (str, tuple)):
iterate = [t]
for node in iterate:
p.text.append((C_COMMENT, None, None))
if isinstance(node, str):
tnode = typeof(node)
if tnode == T_NUMBER:
aa_push_num(v=node)
elif tnode == T_VAR:
aa_push_addr(v="v%s" % prefix + node)
else:
raise ParserError(
"Error generating ASM code on node %s" % node)
elif node[0] == A_BLOCK:
gen_text_section(node[1], stat, p=p, prefix=prefix)
elif node[0] == A_FUNCTION:
p.funcNum += 1
p.text.append((C_COMMENT, None, None))
p.text.append((C_COMMENT, None, "Function %s" % node[1][0].name))
aa_jmp(o=None, v="Func%dEnd" % p.funcNum, shift=1)
aa_label(v="Func_%s" % node[1][0].name, shift=1)
for i, arg in enumerate(node[1][0].args):
var = "v%s_%s" % (node[1][0].name, arg)
aa_mov(o=[C_OPT_NO, C_OPT_ADDR], v=["eax",
"esp+%d" % ((i + 1) * 4)], shift=1)
aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=[var, "eax"], shift=1)
gen_text_section(
node[1][1], stat, p=p, prefix=node[1][0].name + "_")
aa_label(v="Func%dEnd" % p.funcNum, shift=1)
elif node[0] == A_RETURN:
# print (node[1])
aa_mov(
o=[C_OPT_NO, C_OPT_ADDR], v=["eax", "v%s" % prefix + node[1]])
# aa_push_num(v="eax")
aa_ret()
elif node[0] == A_PRINT:
if typeof(node[1]) == T_STRING:
strnum = stat.strs.index(node[1])
aa_push_num(v="str%d" % strnum)
aa_call(v="printf")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])
aa_push_addr(v="stdout")
aa_call(v="fflush")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])
elif typeof(node[1]) == T_VAR:
aa_mov(o=[C_OPT_NO, C_OPT_ADDR], v=["eax",
"v%s" % prefix + node[1]])
aa_push_num(v="eax")
aa_push_num(v="numbs")
aa_call(v="printf")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "8"])
aa_push_addr(v="stdout")
aa_call(v="fflush")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])
else:
raise ParserError("Error print argument: %s" % node)
elif node[0] == A_READ:
assert typeof(node[1]) == T_VAR
aa_push_num(v="v%s" % prefix + node[1])
aa_push_num(v="numbs_in_format")
aa_call(v="scanf")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "8"])
aa_call(v="getchar")
elif node[0] == A_ASSIGN:
var = node[1][0]
gen_text_section(node[1][1], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=["v%s" % prefix + var, "eax"])
elif node[0] == A_IF:
p.ifNum += 1
gen_text_section(node[1][0], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_cmp(o=[C_OPT_NO, C_OPT_NO], v=["eax", "0"])
aa_jmp(o="jnz", v="llIf%dElse" % p.ifNum)
# then
gen_text_section(node[1][1], stat, p=p, prefix=prefix)
aa_jmp(o=None, v="llIf%dEnd" % p.ifNum)
# else
aa_label(v="llIf%dElse" % p.ifNum)
gen_text_section(node[1][2], stat, p=p, prefix=prefix)
aa_label(v="llIf%dEnd" % p.ifNum)
elif node[0] == '+':
gen_text_section(node[1], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_pop(v="ebx")
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["eax", "ebx"])
aa_push_num(v="eax")
elif node[0] in ['>=', '<=', '>', '<', '=']:
gen_text_section(node[1], stat, p=p, prefix=prefix)
p.labelNum += 1
op = {'>=': 'jge',
'<=': 'jle',
'>': 'jg',
'<': 'jl',
'=': 'je',
}
aa_pop(v="eax")
aa_pop(v="ebx")
aa_cmp(v=["eax", "ebx"])
aa_jmp(o=op[node[0]], v="ll%d" % p.labelNum)
aa_push_num(v="1")
aa_jmp(o=None, v="ell%d" % p.labelNum)
aa_label(v="ll%d" % p.labelNum)
aa_push_num(v="0")
aa_label(v="ell%d" % p.labelNum)
elif node[0] == '-':
gen_text_section(node[1], stat, p=p, prefix=prefix)
if len(node[1]) == 2:
aa_pop(v="eax")
aa_pop(v="ebx")
aa_sub(o=[C_OPT_NO, C_OPT_NO], v=["eax", "ebx"])
aa_push_num(v="eax")
else:
aa_pop(v="eax")
aa_neg(v="eax")
aa_push_num(v="eax")
elif node[0] == '*':
gen_text_section(node[1], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_pop(v="ebx")
aa_imul(v="ebx")
aa_push_num(v="eax")
elif node[0] == '/':
gen_text_section(node[1], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_pop(v="ebx")
aa_idiv(v="ebx")
aa_push_num(v="eax")
elif node[0] == '%':
gen_text_section(node[1], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_pop(v="ebx")
aa_idiv(v="ebx")
aa_push_num(v="edx")
elif node[0] == A_WHILE:
p.loopNum += 1
aa_label(v="llWhile%d" % p.loopNum)
gen_text_section(node[1][0], stat, p=p, prefix=prefix)
aa_pop(v="eax")
aa_cmp(o=[C_OPT_NO, C_OPT_NO], v=["eax", "0"])
aa_jmp(o="jnz", v="llWhile%dEnd" % p.loopNum)
gen_text_section(node[1][1], stat, p=p, prefix=prefix)
aa_jmp(o=None, v="llWhile%d" % p.loopNum)
aa_label(v="llWhile%dEnd" % p.loopNum)
elif node[0] == A_CALL:
if stat.funcs[node[1]]['args_count'] != len(node[2]):
raise ParserError("Call %s passing %d arguments. %d expected" %
(node[1], len(node[2]), stat.funcs[node[1]]['args_count']))
for iv in stat.funcs[node[1]]['info'].inner_vars:
aa_push_addr(v="v%s" % iv)
for arg in reversed(node[2]):
gen_text_section(arg, stat, p=p, prefix=prefix)
aa_call(v="Func_%s" % node[1])
aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", str(4 * len(node[2]))])
for iv in reversed(stat.funcs[node[1]]['info'].inner_vars):
aa_pop(v="edx")
aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=["v%s" % iv, "edx"])
aa_push_num(v="eax")
elif node[0] in ['/', '%']:
raise NotImplementedError(
"%s operation is not implemented yet" % node[0])
else:
raise ParserError(
"Error generating ASM code on node %s" % repr(node))
def clear_string(s):
r = s
r = "\",10,\"".join(r.split("\\n"))
return r
def gen_code(t, stat):
p = PseudoAsm()
p.data.append((C_EQU, "_kernel_", "0x80"))
p.data.append((C_COMMENT, None, "Strings"))
for i, vs in enumerate(stat.strs):
s = clear_string(vs)
p.data.append((C_DB, "str%d" % i, "%s, 0" % s))
p.data.append((C_EQU_F, "lstr%d" % i, "str%d" % i))
p.data.append((C_DB, "numbs", "\"%d\", 0"))
p.data.append((C_DB, "numbs_in_format", "\"%d\",0"))
p.data.append((C_COMMENT, None, "Variables"))
for i, vs in enumerate(stat.vars):
p.data.append((C_DD, "v%s" % vs, "0"))
p.text.append((C_GLOBL, None, "_start"))
extern = []
if stat.use_print:
extern.append("printf")
if stat.use_read:
extern.append("scanf")
extern.append("getchar")
if stat.use_read or stat.use_print:
extern.append("fflush")
extern.append("stdout")
for e in extern:
p.text.append((C_EXTRN, None, e))
p.text.append((C_LABEL, None, "_start"))
p.text.append((C_COMMENT, None, "setup stack frame"))
p.text.append((C_PUSH, C_OPT_NO, "ebp"))
p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["ebp", "esp"]))
gen_text_section(t, stat, p=p)
# end
p.text.append((C_COMMENT, None, "restore stack frame"))
p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["esp", "ebp"]))
p.text.append((C_POP, None, "ebp"))
p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["ebx", "0"]))
p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["eax", "1"]))
p.text.append((C_INT, None, "_kernel_"))
return p
./utils/optimizer.py
from utils.const import *
from utils.gen import PseudoAsm
def optimize(pseudo, num=2):
text = pseudo
for x in xrange(num):
text = do_optimize(text)
return text
def do_optimize(pseudo):
pseudo = PseudoAsm(pseudo)
text = pseudo.text
text = optimize_push_pop(text)
text = optimize_mov(text)
text = optimize_mov_push(text)
text = optimize_mov_to_self(text)
text = optimize_clean_lines(text)
pseudo.text = text
return pseudo
def optimize_mov_to_self(text):
" reduce mov (mov eax,eax) "
result = []
for i, op in enumerate(text):
if op[0] == C_MOV:
if (op[2][0] == op[2][1]) and (op[1][0] == op[1][1]):
pass
else:
result.append(op)
else:
result.append(op)
return result
def optimize_push_pop(text):
" reduce push and pop sequences "
stack = []
result = []
ires = []
for i, op in enumerate(text):
if len(op) > 3:
offset = op[3]
else:
offset = 0
if op[0] == C_PUSH:
stack.append(op)
elif op[0] == C_POP:
if len(stack) > 0:
v = stack.pop()
can_reduce = True
for s in ires:
if s[0] == C_MOV:
can_reduce = can_reduce and (
s[2][0] != v[2]) and (s[2][1] != v[2])
if can_reduce:
ires.append(
(C_MOV, [C_OPT_NO, v[1]], (op[2], v[2]), offset))
else:
ires.insert(0, v)
ires.append(op)
else:
ires.append(op)
elif op[0] == C_COMMENT:
result.append(op)
else:
result += stack
stack = []
result += ires
ires = []
result.append(op)
result += stack
return result
def optimize_mov(text):
" reduce mov twice (mov eax, 5; mov ebx, eax)"
prev_mov = None
result = []
comments = []
for i, op in enumerate(text):
if len(op) > 3:
offset = op[3]
else:
offset = 0
if op[0] == C_MOV:
if prev_mov is not None:
v = prev_mov
if (v[2][0] == op[2][1]) and (v[1][0] == op[1][1]) and \
(not (op[1][0] == v[1][1] and op[1][0] == C_OPT_ADDR)):
result.append((C_MOV, [op[1][0], v[
1][1]], (op[2][0], v[2][1]), offset))
prev_mov = None
else:
result.append(prev_mov)
prev_mov = op
else:
prev_mov = op
elif op[0] == C_COMMENT:
comments.append(op)
else:
if prev_mov is not None:
result.append(prev_mov)
prev_mov = None
result += comments
comments = []
result.append(op)
if prev_mov is not None:
result += prev_mov
return result
def optimize_mov_push(text):
" reduce mov before push (mov eax, 5; push eax) "
prev_mov = None
result = []
comments = []
ires = []
for i, op in enumerate(text):
if len(op) > 3:
offset = op[3]
else:
offset = 0
if op[0] == C_MOV:
if prev_mov is not None:
ires.append(prev_mov)
prev_mov = op
elif op[0] == C_PUSH:
if prev_mov is not None:
v = prev_mov
if (v[2][0] == op[2]) and (v[1][0] == op[1]) and (v[1][0] != C_OPT_ADDR):
ires.append((C_PUSH, v[1][1], v[2][1], offset))
else:
ires.append(prev_mov)
ires.append(op)
else:
ires.append(op)
prev_mov = None
elif op[0] == C_COMMENT:
comments.append(op)
else:
if prev_mov is not None:
ires.append(prev_mov)
prev_mov = None
result += comments
comments = []
result += ires
ires = []
result.append(op)
if prev_mov is not None:
result += prev_mov
return result
def optimize_clean_lines(text):
result = []
n = 0
for i, op in enumerate(text):
if op[0] == C_COMMENT:
if op[2] == None or op[2] == "":
n += 1
if n < 2:
result.append(op)
else:
result.append(op)
else:
result.append(op)
n = 0
return result
./utils/gen_asm.py
from __future__ import print_function
from time import strftime, gmtime
from utils.const import *
from utils import ParserError
def gen_real_asm(pseudo, src_file):
res = []
res.append("; Source file: %s" % src_file)
res.append("; Generated %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
res.append("")
res.append("SECTION .data")
res.append("")
for command in pseudo.data:
res += nasm_gen(command)
res.append("")
res.append("SECTION .text")
res.append("")
for command in pseudo.text:
res += nasm_gen(command)
return res
def operand(x, t):
if t == C_OPT_NO:
return x
elif t == C_OPT_ADDR:
return "[%s]" % x
def nasm_gen(l):
cmd = None
if l[0] == C_PUSH:
cmd = ["push\tdword\t%s" % operand(l[2], l[1])]
elif l[0] == C_POP:
cmd = ["pop\t\tdword\t%s" % l[2]]
elif l[0] == C_CALL:
cmd = ["call\t%s" % l[2]]
elif l[0] == C_INT:
cmd = ["int\t%s" % l[2]]
elif l[0] == C_MOV:
cmd = ["mov\t\tdword\t%s, %s" % (operand(l[2][0], l[1][0]),
operand(l[2][1], l[1][1]))]
elif l[0] == C_ADD:
cmd = ["add\t\t%s, %s" % (operand(l[2][0], l[1][0]),
operand(l[2][1], l[1][1]))]
elif l[0] == C_IMUL:
cmd = ["imul\t%s" % l[2]]
elif l[0] == C_IDIV:
cmd = ["idiv\t%s" % l[2]]
elif l[0] == C_SUB:
cmd = ["sub\t\t%s, %s" % (operand(l[2][0], l[1][0]),
operand(l[2][1], l[1][1]))]
elif l[0] == C_CMP:
cmd = ["cmp\t%s,%s" % (l[2][0], l[2][1])]
elif l[0] == C_COMMENT:
if l[2] is None:
cmd = [""]
else:
cmd = ["; %s" % l[2]]
elif l[0] == C_EQU:
cmd = ["%s:\tequ\t%s" % (l[1], l[2])]
elif l[0] == C_EQU_F:
cmd = ["%s:\tequ\t\t$-%s" % (l[1], l[2])]
elif l[0] == C_DB:
cmd = ["%s:\tdb\t\t%s" % (l[1], l[2])]
elif l[0] == C_DD:
cmd = ["%s:\t\tdd\t%s" % (l[1], l[2])]
elif l[0] == C_GLOBL:
cmd = ["global\t%s" % l[2]]
elif l[0] == C_EXTRN:
cmd = ["extern\t%s" % l[2]]
elif l[0] == C_NEG:
cmd = ["neg\t%s" % l[2]]
elif l[0] == C_LABEL:
cmd = ["%s:" % l[2]]
elif l[0] == C_JMP:
if l[1] is None:
cmd = ["jmp\t%s" % l[2]]
else:
cmd = ["%s\t%s" % (l[1], l[2])]
elif l[0] == C_RET:
cmd = ["ret"]
else:
raise ParserError("Can't translate %d command" % l[0])
cmd = map(lambda x: "\t%s" % x, cmd)
if len(l) > 3:
return map(lambda x: "%s%s" % (l[3] * "\t", x), cmd)
else:
return cmd