Додатки

[B1]Пустоваров В.И. Ассемблер: программирование и анализ корректности машинных программ. – К: BHV, 2000, Стор. 230-265.
[B2]Dandamudi, S.P.: Guide to Assembly Language Programming in Linux - Springer, 2005, Стор. 552.
[B3]Tanenbaum, A.S. and Woodhull, A.S.: Operating systems: design and implementation - Pearson Prentice Hall, 2006, 1054 c.
[B4]Mark Lutz: Learning Python, 2009, 1214 c.
[B5]Marty Alchin: Pro Python, 2010, 368 c.
[B6]Бек Л.: Введение в системное программирование: Пер. с англ.- М.: Мир, 1988. - 448 с.
[B7]Інтернет-ресурс Intel x86 Instruction Reference - http://www.posix.nl/linuxassembly/nasmdochtml/nasmdoca.html
[B8]Інтернет-ресурс Running NASM - http://www.nasm.us/doc/nasmdoc2.html

Висновок

Було розроблено мову програмування високого рівня, повоної за Тюрингом, до неї розроблено компілятор з функцією оптимізації. Мова описана в формальній граматиці.

Під час написання роботи було написано лексичний, синтаксичний аналізатори, вивчені та реалізовані оптимізаційні алгоритми, робота кінцевого автомату станів, реалізований механізм обробки помилок. Вивчений синтаксис асемблеру NASM, тонкощі компіляції програм для операційної системи GNU/Linux.

Розроблена програма може бути використана як повноцінний компілятор, або бути частиної інтерпретатора з функцію Just-in-Time компіляції, які зараз достатньо поширені. Така можливість істотньо покращить показники швидкості запуску програм інтерпретованих мов програмування.

Код програми

./pyc

#!/usr/bin/python2

from __future__ import print_function

import sys
import string
import os
import argparse
import subprocess

from utils.lexer import lex
from utils.syntax import synt, print_tree
from utils.gen import find_vars, gen_code
from utils.gen_asm import gen_real_asm
from utils.optimizer import optimize
from utils import ParserError, verbose_output

VERSION = (1, 1)


def print_header():
    print("pyCompiler %d.%d" % VERSION)
    print('-------------------')


def build_file_path(fl_name, ext, build_dir=True):
    fl = dirname = os.path.basename(fl_name)

    if build_dir:
        dirname = os.path.join(os.path.dirname(fl_name), 'build')
    else:
        dirname = os.path.dirname(fl_name)

    parts = fl.split('.')
    parts[-1] = ext
    return os.path.join(dirname, string.join(parts, '.'))


@verbose_output
def do_lex(args):
    lex_file = build_file_path(args.file, 'lex')
    print("Lexical analysis: ", end="")

    lex_l = lex(file(args.file).read())

    if args.lex:
        lexf = open(lex_file, 'w')
        print(lex_l, file=lexf)
        lexf.close()

    print("Done")

    return lex_l


@verbose_output
def do_synt(args, lex_l):
    tree_file = build_file_path(args.file, 'synt')

    print("Syntax analysis: ", end="")

    tree = synt(lex_l)

    if args.synt:
        tree_f = open(tree_file, 'w')
        print(tree, file=tree_f)
        print_tree(tree, f=tree_f)
        tree_f.close()

    print("Done")

    return tree


@verbose_output
def do_stats(args, tree):
    stats_file = build_file_path(args.file, 'stat')

    print("Find variables and strings: ", end="")
    stat = find_vars(tree)

    if args.synt:
        stats_f = open(stats_file, 'w')
        print("vars =", stat.vars, file=stats_f)
        print("strs =", stat.strs, file=stats_f)
        stats_f.close()
    print("Done")

    return stat


@verbose_output
def do_gen_pseudo_asm(args, tree, stat):
    print("Generate Pseudo-Asm code: ", end="")
    p = gen_code(tree, stat)

    print("Done")

    return p


@verbose_output
def do_gen_asm(args, p):
    fl = os.path.basename(args.file)
    asmfile_name = build_file_path(args.file, 'asm')

    print("Generate NASM code: ", end="")

    asmfile = open(asmfile_name, 'w')
    lines = gen_real_asm(p, os.path.basename(fl))

    for line in lines:
        print(line, file=asmfile)

    asmfile.close()

    print("Done")

    return p


@verbose_output
def do_optimization(args, p):
    print("Optimization: ", end="")
    res = optimize(p)
    print("Done")

    return res


@verbose_output
def do_compile(args):
    lstfile_name = build_file_path(args.file, 'lst')
    asmfile_name = build_file_path(args.file, 'asm')
    ofile_name = build_file_path(args.file, 'o')

    print("Compiling: ", end="")

    params = ["nasm", "-f", "elf", "-o", ofile_name, '-O2' if not args.no_optimize else '',
              '-l' if args.lst else '', lstfile_name if args.lst else '', asmfile_name]
    try:
        if args.verbose:
            print('\n', ' '.join(params))
        subprocess.check_output(params)
    except subprocess.CalledProcessError:
        print("NASM Error!", file=sys.stderr)
        # print(res)
        sys.exit(-1)
    except OSError:
        print("NASM not found", file=sys.stderr)
        sys.exit(-1)

    if not args.asm:
        os.remove(asmfile_name)
    print("Done")


@verbose_output
def do_link(args):
    ofile_name = build_file_path(args.file, 'o')
    binfile_name = build_file_path(args.file, 'bin', build_dir=False)

    print("Linking: ", end="")
    params = ["ld", '-s',  "-lc", '-dynamic-linker', '/lib/ld-linux.so.2', '-o', binfile_name, ofile_name]

    try:
        print('\n', ' '.join(params))
        res = subprocess.check_output(params)
    except subprocess.CalledProcessError:
        print("ld Error!", file=sys.stderr)
        print(res)
        sys.exit(-1)
    except OSError:
        print("ld not found", file=sys.stderr)
        sys.exit(-1)

    if not args.asm:
        os.remove(ofile_name)
    print("Done")


def main():
    print_header()

    parser = argparse.ArgumentParser()
    parser.add_argument('file', metavar="FILE", type=str, help='source file name')
    parser.add_argument('-l', '--lex', action="store_true", help="save lexical processing results to file")
    parser.add_argument('-s', '--synt', action="store_true", help="save syntax tree to file")
    parser.add_argument('-v', '--verbose', action="store_true", help="print information")
    parser.add_argument('-A', '--asm', action="store_true", help="save asm file")
    parser.add_argument('-L', '--lst', action="store_true", help="listing")
    parser.add_argument('-O0', action="store_true", help="no internal optimization")
    parser.add_argument('-n', '--no-optimize', action="store_true", help="no -O2 option for nasm")
    parser.add_argument('-a', '--all-intermediate', action="store_true", help="Synonym for -slAL")
    parser.add_argument('--target', type=str, choices=["linux", "c2m"], default="linux",
                        help="c2m target is Cyclone II based CPU")

    args = parser.parse_args()
    if args.all_intermediate:
        args.lex = args.asm = args.synt = args.lst = True
    build_files = args.lex or args.asm or args.synt or args.lst

    build_dir_name = os.path.join(os.path.dirname(args.file), 'build')

    if not os.path.exists(build_dir_name):
        os.mkdir(build_dir_name)

    try:
        tree = do_synt(args, do_lex(args))

        p = do_gen_pseudo_asm(args, tree, do_stats(args, tree))
        if not args.O0:
            p = do_optimization(args, p)
        do_gen_asm(args, p)
        do_compile(args)
        do_link(args)

    except IOError:
        print("\n\nERROR: File not found", file=sys.stderr)
        sys.exit(-1)

    except ParserError, e:
        print('\n\nERROR: %s' % e.message, file=sys.stderr)

        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(-2)

    except NotImplementedError, e:
        print("\n\nNot implemented: %s" % e.message, file=sys.stderr)
        sys.exit(-3)

    if not build_files:
        try:
            os.rmdir(build_dir_name)
        except OSError:
            pass

if __name__ == '__main__':
    main()

./utils/__init__.py

class ParserError(Exception):
    pass

import os
import sys

from const import *


def typeof(t):
    if t is None:
        return T_NO

    if isinstance(t, FunctionCallInfo):
        return T_CALL
    if not isinstance(t, str):
        return T_NO

    if t.isalpha():
        if t in RESERVED_WORDS:
            return RESERVED_WORDS[t]
        else:
            return T_VAR
    elif t.isdigit():
        return T_NUMBER
    elif t in SYMB_DICT:
        return SYMB_DICT[t]
    elif t[0] == '"' and t[-1] == '"':
        return T_STRING
    elif t.isalnum():
        return T_VAR
    else:
        return T_NO


class FunctionCallInfo(str):
    def __new__(cls, name, args):
        s = super(FunctionCallInfo, cls).__new__(cls, name)
        s.args = args
        return s


def verbose_output(func):
    """
    Suppresses output if --verbose was not set
    """
    def _verbose_output(*pargs, **kwargs):
        args = pargs[0]
        old_stdout = sys.stdout
        if not args.verbose:
            null_output = open(os.devnull, 'w')
            sys.stdout = null_output
        try:
            ret = func(*pargs, **kwargs)
            if not args.verbose:
                sys.stdout = old_stdout
        except:
            #fallback
            if not args.verbose:
                sys.stdout = old_stdout
            raise
        return ret
    return _verbose_output

./utils/const.py

# Token types
T_NO, T_IF, T_PRINT, T_READ, T_VAR, T_NUMBER, T_STRING, T_OPEREND, T_CTRLEND, \
    T_EQ, T_PLUS, T_MINUS, T_IMUL, T_IDIV, T_POPEN, T_PCLOSE, T_BEGIN, T_END, \
    T_LT, T_GT, T_GE, T_LE, T_ELSE, T_ENDIF, T_WHILE, T_ENDWHILE, T_MOD,\
    T_FUNCTION, T_SEPARATOR, T_RETURN, T_ENDFUNC, T_CALL = range(32)

# Tree blocks
A_NO, A_ASSIGN, A_IF, A_BLOCK, A_PRINT, A_ELSE, A_READ, A_WHILE, A_FUNCTION, \
    A_RETURN, A_CALL = range(11)

# Asm command type
# C_EQU_F is $-label
C_NO, C_ADD, C_SUB, C_PUSH, C_POP, C_CALL, C_PRINT, C_COMMENT, C_READ, C_MOV, \
    C_CMP, C_DB, C_DD, C_EQU, C_EQU_F, C_EXTRN, C_GLOBL, C_LABEL, C_INT, C_JMP, \
    C_IMUL, C_IDIV, C_RET, C_NEG = range(24)

C_OPT_NO, C_OPT_ADDR, C_PRINT_STR, C_PRINT_VAR = range(4)


EXPRESSIONS_TOKENS = [T_VAR, T_NUMBER, T_STRING, T_EQ,
                      T_PLUS, T_MINUS, T_IMUL, T_IDIV, T_MOD,
                      T_LT, T_GT, T_GE, T_LE, T_POPEN, T_PCLOSE,
                      T_SEPARATOR, ]

NAMES = \
    {
        A_NO: "<no>",
        A_ASSIGN: "=",
        A_IF: "if",
        A_BLOCK: "{block}",
        A_PRINT: "print",
        A_READ: "read",
        A_WHILE: "while",
        A_FUNCTION: "function",
        A_RETURN: "return",
        A_CALL: "call",
    }

# line start states
START_LIST = []  # gen

RANGES_LIST = (T_BEGIN, T_END)
EXPRESSIONS_STATES = None  # gen

START_NODE = -1

links = \
    {  # gen
    }

SYMB_DICT = \
    {
        "=": T_EQ,
        "+": T_PLUS,
        "-": T_MINUS,
        "*": T_IMUL,
        "/": T_IDIV,
        "%": T_MOD,
        ";": T_OPEREND,
        ":": T_CTRLEND,
        ">": T_GT,
        "<": T_LT,
        ">=": T_GE,
        "<=": T_LE,
        '(': T_POPEN,
        ')': T_PCLOSE,
        '{': T_BEGIN,
        '}': T_END,
        ',': T_SEPARATOR,
    }

RESERVED_WORDS = \
    {
        'if': T_IF,
        'print': T_PRINT,
        'read': T_READ,
        'else': T_ELSE,
        'endif': T_ENDIF,
        'while': T_WHILE,
        'endwhile': T_ENDWHILE,
        'function': T_FUNCTION,
        'return': T_RETURN,
        'endfunc': T_ENDFUNC,
    }

from graph import read_syntax_graph
import sys
read_syntax_graph(sys.modules[__name__])

./utils/lexer.py

import string

from utils import ParserError


class Token(str):
    def __new__(cls, val, line):
        s = super(Token, cls).__new__(cls, val)
        s.line = line
        return s


def lex(text):
    global line_num

    NO, ALPHA, NUM, SYMB, CMDEND, QUOTE, QUOTE_end, COMMENT = range(8)

    line_num = 1

    def typeof(s, string=False):
        global line_num
        if s[0].isalpha():
            return ALPHA
        elif s[0].isdigit():
            return NUM
        elif s[0] in "=:><+-*/(){}%,":
            return SYMB
        elif s[0] in [';']:
            return CMDEND
        elif s[0] in ['"', '\'']:
            return QUOTE
        elif s[0] in ['#']:
            return COMMENT
        elif s.strip() == "":
            return NO
        else:
            if not string:
                raise ParserError(
                    "Unknown symbol %s on line %d" % (s, line_num))

    def symb_check(t, s):
        COMBINATIONS = ('>=', '<=', '**', )
        return (string.strip(string.join(t, '')) + s) in COMBINATIONS

    all_tokens = []
    token = []
    prev_type = current_type = NO
    string_token = ""
    inline_comment = False

    for s in text:
        if s == '\n':
            if prev_type == QUOTE:
                raise ParserError("Unclosed quotes on line %d" % line_num)
            line_num += 1
        if inline_comment:
            if s == '\n':
                inline_comment = False
            continue

        current_type = typeof(s, prev_type == QUOTE)

        if prev_type == QUOTE:
            if current_type != QUOTE:
                string_token += s
            else:
                all_tokens.append(Token("\"%s\"" % string_token, line_num))
                string_token = ""
                prev_type = QUOTE_end
                token = []
            continue

        if current_type == COMMENT:
            inline_comment = True
            continue
        if ((prev_type == current_type) or (prev_type == ALPHA and current_type == NUM)) \
                and (current_type != SYMB or symb_check(token, s)):
            token.append(s)
        else:
            if prev_type != NO:
                clear_token = string.strip(string.join(token, ''))
                if len(clear_token) > 0:
                    all_tokens.append(Token(clear_token, line_num))
            prev_type = current_type
            token = [s]

    clear_token = string.strip(string.join(token, ''))
    if len(clear_token) > 0:
        all_tokens.append(clear_token)
    return all_tokens

./utils/syntax.py

#-*- coding: utf8 -*-

" Syntax analyser "

from __future__ import print_function

import sys

from utils import ParserError
from const import *
from . import typeof, FunctionCallInfo

MACHINE_DEFAULT, MACHINE_EXPR = range(2)
machine = []

DEBUG = False


def synt(lex):
    " Builds syntax tree "
    global global_lex, global_stack, gres
    global_lex = []
    global_stack = []
    gres = []
    def_machine = m_default()
    def_machine.next()
    # INIT
    machine.append(def_machine)

    global_lex = list(reversed(lex))
    while len(global_lex) > 0:
        token = global_lex.pop()
        machine[-1].send(token)

    def_machine.send('}')
    return gres

global_stack = []
gres = []
global_lex = []


def m_expressions():
    " Machine for expression analysis "
    global global_lex
    stack = []
    res = []

    weights = {
        T_POPEN: 0,
        T_CALL: 0,
        T_PCLOSE: 1,
        T_SEPARATOR: 2,
        T_PLUS: 20,
        T_MINUS: 20,
        T_IMUL: 10,
        T_IDIV: 10,
        T_MOD: 10,

        T_GT: 5,
        T_LT: 5,
        T_GE: 5,
        T_LE: 5,
        T_EQ: 5,

        T_OPEREND: -9000,
        T_CTRLEND: -9000,
    }

    current_line = -1

    while True:
        token = (yield)
        if token == None:
            continue

        if hasattr(token, 'line'):
            current_line = token.line

        token_type = typeof(token)

        if token_type not in [T_OPEREND, T_CTRLEND] + EXPRESSIONS_TOKENS:
            raise ParserError('Syntax error on line %d' % current_line)

        if token_type in [T_VAR, T_NUMBER]:
            res.append(token)
            # FIXME: replace by logging
            if DEBUG:
                print (stack, res)
            continue

        if token_type in [T_POPEN, ]:  # parentheses or function call
            if (len(res) > 0) and typeof(res[-1]) == T_VAR:
                stack.append(FunctionCallInfo(res.pop(), len(res)))
            else:
                stack.append(token)
            if DEBUG:
                print (stack, res)
            continue

        while (len(stack) != 0) and \
              (weights[token_type] <= weights[typeof(stack[-1])]):
            operation = (stack[-1], res[-2:])
            del res[-2:]
            stack.pop()
            res.append(operation)

        if token_type in [T_PCLOSE, ]:
            oper = stack.pop()
            if typeof(oper) == T_CALL:
                assert isinstance(oper, FunctionCallInfo)
                # function
                args_count = len(res) - oper.args
                if DEBUG:
                    print (oper, args_count, res[-args_count:])
                if args_count > 0:
                    args = res[-args_count:]
                    del res[-args_count:]
                else:
                    args = []
                res.append((A_CALL, oper, args))

            if DEBUG:
                print (stack, res)
            continue

        if len(stack) == 0 or (weights[token_type] > weights[typeof(stack[-1])]):
            if token_type != T_SEPARATOR:
                stack.append(token)

        if DEBUG:
            print (stack, res)

        if token_type not in EXPRESSIONS_TOKENS:
            stack.pop()
            assert len(stack) == 0
            machine.pop()
            global_stack.append(res)
            global_lex.append(token)


class FunctionDescription(object):
    def __init__(self):
        self.name = None
        self.args = []
        self.inner_vars = []

    def __repr__(self):
        return "%s%s" % (self.name, self.args)


class Machine(object):
    def __init__(self):
        self.func = None
        self.token = None

    def extract_block(self, stop):
        group = []
        while True:
            if not len(gres) > 0:
                raise ParserError("Syntax error: invalid block")
            last = gres.pop()
            if last != stop:
                group.append(last)
            else:
                break
        gres.append((A_BLOCK, group))

    def do_function(self):
        self.func = FunctionDescription()

    def do_func_name(self):
        self.func.name = self.token

    def do_func_arg(self):
        self.func.args.append(self.token)

    def do_func_args_end(self):
        gres.append(A_FUNCTION)

    def do_beg(self):
        gres.append(A_BLOCK)

    def do_end(self):
        group = []
        self.extract_block(A_BLOCK)
        if len(group) > 0:
            gres.append((A_BLOCK, group))

    def do_var(self):
        gres.append(self.token)

    def do_string(self):
        gres.append(self.token)

    def do_ifsend(self):
        gres.append(A_IF)

    def do_elsesend(self):
        self.extract_block(A_IF)  # THEN-block
        gres.append(A_ELSE)

    def do_endifsend(self):
        self.extract_block(A_ELSE)
        elseblock = gres.pop()
        thenblock = gres.pop()
        oper = (A_IF, [global_stack.pop(), thenblock, elseblock])
        # print (op)
        gres.append(oper)

    def do_endfuncsend(self):
        self.extract_block(A_FUNCTION)
        block = gres.pop()
        op = (A_FUNCTION, [self.func, block])
        gres.append(op)

    def do_assignsend(self):
        oper = (A_ASSIGN, [gres.pop(), global_stack.pop()])
        gres.append(oper)

    def do_returnsend(self):
        oper = (A_RETURN, gres.pop())
        gres.append(oper)

    def do_printsend(self):
        oper = (A_PRINT, gres.pop())
        gres.append(oper)

    def do_readsend(self):
        oper = (A_READ, gres.pop())
        gres.append(oper)

    def do_whilesend(self):
        gres.append(A_WHILE)

    def do_endwhilesend(self):
        self.extract_block(A_WHILE)
        block = gres.pop()
        op = (A_WHILE, [global_stack.pop(), block])
        gres.append(op)


def m_default():
    ptype = START_NODE
    waitfor = links[ptype][0]

    def istypeeq(token_type, state_type):
        if state_type == None:
            return True
        else:
            return token_type in links[state_type][1]

    stack = []
    gres.append(A_BLOCK)
    current_line = -1
    # func = None
    state_executor = Machine()

    while True:
        token = (yield)
        if hasattr(token, 'line'):
            current_line = token.line

        if ptype in EXPRESSIONS_STATES:  # processed in other machine, so waiting for ';' or ':'
            waitfor = links[ptype][0]

        ctype = typeof(token)

        # check syntax errors
        possibles = reduce(lambda a, b: a + b, [[] if links[x][1] == None else list(links[x][1])
                                                for x in waitfor])
        if possibles is None:
            possibles = []
        else:
            possibles = list(possibles)
        possibles += RANGES_LIST
        if possibles is not None and ctype not in possibles:
            raise ParserError('Syntax error on line %d' % current_line)
        if ctype == T_NO and token != None:
            # FIXME: dead code?
            raise ParserError(
                "Unknown token '%s' on line %d" % (token, current_line))

        # Next state
        possibles = []

        if len(waitfor) == 1:  # single transition
            possibles.append(waitfor[0])
        else:
            for possible in waitfor:
                if istypeeq(typeof(token), possible):
                    possibles.append(possible)
        if len(possibles) > 1:
            raise ParserError("Syntax error: Ambiguity")
        if len(possibles) == 0:
            raise ParserError("Syntax error")

        ptype = possibles[0]

        # Process state

        if links[ptype][2] is not None:
            action = getattr(state_executor, "do_%s" % links[ptype][2])
            state_executor.token = token
            action()

        waitfor = links[ptype][0]

        # check instant states
        if links[waitfor[0]][3]:
            ptype = links[ptype][0][0]
            waitfor = links[ptype][0]

        if len(waitfor) == 1 and waitfor[0] in EXPRESSIONS_STATES:
            m = m_expressions()
            m.next()
            machine.append(m)
            stack.append(ctype)
            ptype = waitfor[0]


def print_tree(t, n=0, f=sys.stdout):
    if isinstance(t, list):
        for x in reversed(t):
            print_tree(x, n, f=f)
    elif isinstance(t, tuple):
        if t[0] in NAMES:
            print("--" * n, NAMES[t[0]], file=f)
        else:
            print("--" * n, t[0], file=f)
        print_tree(t[1], n=n + 1, f=f)
    else:
        print("--" * n, t, file=f)

./utils/gen.py

from __future__ import print_function

from functools import partial

from utils.const import *
from utils import ParserError, typeof


class TreeStats(object):
    def __init__(self, vars=None, strs=None, funcs=None):
        if vars is None:
            vars = []
        if strs is None:
            strs = []
        if funcs is None:
            funcs = {}

        self.vars = vars
        self.strs = strs
        self.funcs = funcs
        self.use_print = False
        self.use_read = False


def find_vars(t, stat=None, func=None):
    if func is not None:
        prefix = func.name + "_"
    else:
        prefix = ""

    if stat == None:
        stat = TreeStats()
    if isinstance(t, list):
        for x in reversed(t):
            find_vars(x, stat=stat, func=func)
    elif isinstance(t, tuple):
        if t[0] == A_PRINT:
            stat.use_print = True
        elif t[0] == A_READ:
            stat.use_read = True
        if t[0] == A_FUNCTION:
            stat.funcs[t[1][0].name] = {'args': t[1][0].args,
                                        'args_count': len(t[1][0].args),
                                        'info': t[1][0]}
            for v in t[1][0].args:
                var = "%s_%s" % (t[1][0].name, v)
                stat.vars.append(var)
                t[1][0].inner_vars.append(var)

            find_vars(t[1][1], stat=stat, func=t[1][0])
        else:
            find_vars(t[1], stat=stat, func=func)
    else:
        if isinstance(t, str) and typeof(t) == T_VAR:
            if prefix + t not in stat.vars:
                stat.vars.append(prefix + t)
                if func is not None:
                    func.inner_vars.append(prefix + t)

        if isinstance(t, str) and typeof(t) == T_STRING:
            if t not in stat.strs:
                stat.strs.append(t)
    return stat


def make_asm_node_p(p, cmd, o, v, shift=0):
    p.text.append((cmd, o, v, shift,))


class PseudoAsm(object):
    def __init__(self, p=None):
        if p is None:
            self.text = []
            self.data = []
            self.labelNum = 0
            self.ifNum = 0
            self.loopNum = 0
            self.funcNum = 0
        else:
            self.text = p.text[:]
            self.data = p.data[:]
            self.labelNum = p.labelNum
            self.ifNum = p.ifNum
            self.loopNum = p.loopNum
            self.funcNum = p.funcNum


def gen_text_section(t, stat, p=None, prefix=""):
    if p is None:
        p = PseudoAsm()

    make_asm_node = partial(
        make_asm_node_p, p=p, shift=(0 if len(prefix) == 0 else 1))
    aa_push = partial(make_asm_node, cmd=C_PUSH)
    aa_pop = partial(make_asm_node, cmd=C_POP, o=None)
    aa_mov = partial(make_asm_node, cmd=C_MOV)
    aa_call = partial(make_asm_node, cmd=C_CALL, o=None)
    aa_add = partial(make_asm_node, cmd=C_ADD)
    aa_sub = partial(make_asm_node, cmd=C_SUB)
    aa_imul = partial(make_asm_node, cmd=C_IMUL, o=None)
    aa_idiv = partial(make_asm_node, cmd=C_IDIV, o=None)
    aa_cmp = partial(make_asm_node, cmd=C_CMP, o=None)
    aa_jmp = partial(make_asm_node, cmd=C_JMP)
    aa_label = partial(make_asm_node, cmd=C_LABEL, o=None)
    aa_neg = partial(make_asm_node, cmd=C_NEG, o=None)
    aa_push_num = partial(aa_push, o=C_OPT_NO)
    aa_push_addr = partial(aa_push, o=C_OPT_ADDR)
    aa_ret = partial(make_asm_node, cmd=C_RET, o=None, v=None)

    iterate = t
    if isinstance(t, list):
        iterate = reversed(t)
    if isinstance(t, (str, tuple)):
        iterate = [t]
    for node in iterate:
        p.text.append((C_COMMENT, None, None))
        if isinstance(node, str):
            tnode = typeof(node)
            if tnode == T_NUMBER:
                aa_push_num(v=node)
            elif tnode == T_VAR:
                aa_push_addr(v="v%s" % prefix + node)
            else:
                raise ParserError(
                    "Error generating ASM code on node %s" % node)

        elif node[0] == A_BLOCK:
            gen_text_section(node[1], stat, p=p, prefix=prefix)

        elif node[0] == A_FUNCTION:
            p.funcNum += 1
            p.text.append((C_COMMENT, None, None))
            p.text.append((C_COMMENT, None, "Function %s" % node[1][0].name))
            aa_jmp(o=None, v="Func%dEnd" % p.funcNum, shift=1)
            aa_label(v="Func_%s" % node[1][0].name, shift=1)
            for i, arg in enumerate(node[1][0].args):
                var = "v%s_%s" % (node[1][0].name, arg)
                aa_mov(o=[C_OPT_NO, C_OPT_ADDR], v=["eax",
                       "esp+%d" % ((i + 1) * 4)], shift=1)
                aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=[var, "eax"], shift=1)
            gen_text_section(
                node[1][1], stat, p=p, prefix=node[1][0].name + "_")
            aa_label(v="Func%dEnd" % p.funcNum, shift=1)

        elif node[0] == A_RETURN:
            # print (node[1])
            aa_mov(
                o=[C_OPT_NO, C_OPT_ADDR], v=["eax", "v%s" % prefix + node[1]])
            # aa_push_num(v="eax")
            aa_ret()

        elif node[0] == A_PRINT:
            if typeof(node[1]) == T_STRING:
                strnum = stat.strs.index(node[1])

                aa_push_num(v="str%d" % strnum)
                aa_call(v="printf")
                aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])
                aa_push_addr(v="stdout")
                aa_call(v="fflush")
                aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])

            elif typeof(node[1]) == T_VAR:
                aa_mov(o=[C_OPT_NO, C_OPT_ADDR], v=["eax",
                       "v%s" % prefix + node[1]])
                aa_push_num(v="eax")
                aa_push_num(v="numbs")
                aa_call(v="printf")
                aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "8"])
                aa_push_addr(v="stdout")
                aa_call(v="fflush")
                aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "4"])
            else:
                raise ParserError("Error print argument: %s" % node)

        elif node[0] == A_READ:
            assert typeof(node[1]) == T_VAR

            aa_push_num(v="v%s" % prefix + node[1])
            aa_push_num(v="numbs_in_format")
            aa_call(v="scanf")
            aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", "8"])
            aa_call(v="getchar")

        elif node[0] == A_ASSIGN:
            var = node[1][0]
            gen_text_section(node[1][1], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=["v%s" % prefix + var, "eax"])

        elif node[0] == A_IF:
            p.ifNum += 1
            gen_text_section(node[1][0], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_cmp(o=[C_OPT_NO, C_OPT_NO], v=["eax", "0"])
            aa_jmp(o="jnz", v="llIf%dElse" % p.ifNum)
            # then
            gen_text_section(node[1][1], stat, p=p, prefix=prefix)
            aa_jmp(o=None, v="llIf%dEnd" % p.ifNum)
            # else
            aa_label(v="llIf%dElse" % p.ifNum)
            gen_text_section(node[1][2], stat, p=p, prefix=prefix)

            aa_label(v="llIf%dEnd" % p.ifNum)

        elif node[0] == '+':
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_pop(v="ebx")
            aa_add(o=[C_OPT_NO, C_OPT_NO], v=["eax", "ebx"])
            aa_push_num(v="eax")

        elif node[0] in ['>=', '<=', '>', '<', '=']:
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            p.labelNum += 1
            op = {'>=': 'jge',
                  '<=': 'jle',
                  '>': 'jg',
                  '<': 'jl',
                  '=': 'je',
                  }
            aa_pop(v="eax")
            aa_pop(v="ebx")
            aa_cmp(v=["eax", "ebx"])
            aa_jmp(o=op[node[0]], v="ll%d" % p.labelNum)
            aa_push_num(v="1")
            aa_jmp(o=None, v="ell%d" % p.labelNum)
            aa_label(v="ll%d" % p.labelNum)
            aa_push_num(v="0")
            aa_label(v="ell%d" % p.labelNum)

        elif node[0] == '-':
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            if len(node[1]) == 2:
                aa_pop(v="eax")
                aa_pop(v="ebx")
                aa_sub(o=[C_OPT_NO, C_OPT_NO], v=["eax", "ebx"])
                aa_push_num(v="eax")
            else:
                aa_pop(v="eax")
                aa_neg(v="eax")
                aa_push_num(v="eax")

        elif node[0] == '*':
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_pop(v="ebx")
            aa_imul(v="ebx")
            aa_push_num(v="eax")

        elif node[0] == '/':
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_pop(v="ebx")
            aa_idiv(v="ebx")
            aa_push_num(v="eax")

        elif node[0] == '%':
            gen_text_section(node[1], stat, p=p, prefix=prefix)
            aa_pop(v="eax")
            aa_pop(v="ebx")
            aa_idiv(v="ebx")
            aa_push_num(v="edx")

        elif node[0] == A_WHILE:
            p.loopNum += 1

            aa_label(v="llWhile%d" % p.loopNum)
            gen_text_section(node[1][0], stat, p=p, prefix=prefix)

            aa_pop(v="eax")
            aa_cmp(o=[C_OPT_NO, C_OPT_NO], v=["eax", "0"])
            aa_jmp(o="jnz", v="llWhile%dEnd" % p.loopNum)

            gen_text_section(node[1][1], stat, p=p, prefix=prefix)

            aa_jmp(o=None, v="llWhile%d" % p.loopNum)
            aa_label(v="llWhile%dEnd" % p.loopNum)

        elif node[0] == A_CALL:
            if stat.funcs[node[1]]['args_count'] != len(node[2]):
                raise ParserError("Call %s passing %d arguments. %d expected" %
                                 (node[1], len(node[2]), stat.funcs[node[1]]['args_count']))

            for iv in stat.funcs[node[1]]['info'].inner_vars:
                aa_push_addr(v="v%s" % iv)

            for arg in reversed(node[2]):
                gen_text_section(arg, stat, p=p, prefix=prefix)

            aa_call(v="Func_%s" % node[1])
            aa_add(o=[C_OPT_NO, C_OPT_NO], v=["esp", str(4 * len(node[2]))])

            for iv in reversed(stat.funcs[node[1]]['info'].inner_vars):
                aa_pop(v="edx")
                aa_mov(o=[C_OPT_ADDR, C_OPT_NO], v=["v%s" % iv, "edx"])

            aa_push_num(v="eax")

        elif node[0] in ['/', '%']:
            raise NotImplementedError(
                "%s operation is not implemented yet" % node[0])
        else:
            raise ParserError(
                "Error generating ASM code on node %s" % repr(node))


def clear_string(s):
    r = s
    r = "\",10,\"".join(r.split("\\n"))
    return r


def gen_code(t, stat):
    p = PseudoAsm()

    p.data.append((C_EQU, "_kernel_", "0x80"))

    p.data.append((C_COMMENT, None, "Strings"))
    for i, vs in enumerate(stat.strs):
        s = clear_string(vs)
        p.data.append((C_DB, "str%d" % i, "%s, 0" % s))
        p.data.append((C_EQU_F, "lstr%d" % i, "str%d" % i))

    p.data.append((C_DB, "numbs", "\"%d\", 0"))
    p.data.append((C_DB, "numbs_in_format", "\"%d\",0"))

    p.data.append((C_COMMENT, None, "Variables"))

    for i, vs in enumerate(stat.vars):
        p.data.append((C_DD, "v%s" % vs, "0"))

    p.text.append((C_GLOBL, None, "_start"))

    extern = []
    if stat.use_print:
        extern.append("printf")
    if stat.use_read:
        extern.append("scanf")
        extern.append("getchar")
    if stat.use_read or stat.use_print:
        extern.append("fflush")
        extern.append("stdout")

    for e in extern:
        p.text.append((C_EXTRN, None, e))

    p.text.append((C_LABEL, None, "_start"))

    p.text.append((C_COMMENT, None, "setup stack frame"))
    p.text.append((C_PUSH, C_OPT_NO, "ebp"))
    p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["ebp", "esp"]))

    gen_text_section(t, stat, p=p)

    # end
    p.text.append((C_COMMENT, None, "restore stack frame"))
    p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["esp", "ebp"]))
    p.text.append((C_POP, None, "ebp"))
    p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["ebx", "0"]))
    p.text.append((C_MOV, [C_OPT_NO, C_OPT_NO], ["eax", "1"]))
    p.text.append((C_INT, None, "_kernel_"))

    return p

./utils/optimizer.py


from utils.const import *
from utils.gen import PseudoAsm


def optimize(pseudo, num=2):
    text = pseudo
    for x in xrange(num):
        text = do_optimize(text)
    return text


def do_optimize(pseudo):
    pseudo = PseudoAsm(pseudo)

    text = pseudo.text
    text = optimize_push_pop(text)
    text = optimize_mov(text)
    text = optimize_mov_push(text)
    text = optimize_mov_to_self(text)
    text = optimize_clean_lines(text)

    pseudo.text = text
    return pseudo


def optimize_mov_to_self(text):
    " reduce mov (mov eax,eax) "
    result = []
    for i, op in enumerate(text):
        if op[0] == C_MOV:
            if (op[2][0] == op[2][1]) and (op[1][0] == op[1][1]):
                pass
            else:
                result.append(op)
        else:
            result.append(op)
    return result


def optimize_push_pop(text):
    " reduce push and pop sequences "
    stack = []
    result = []
    ires = []
    for i, op in enumerate(text):
        if len(op) > 3:
            offset = op[3]
        else:
            offset = 0

        if op[0] == C_PUSH:
            stack.append(op)
        elif op[0] == C_POP:
            if len(stack) > 0:
                v = stack.pop()
                can_reduce = True
                for s in ires:
                    if s[0] == C_MOV:
                        can_reduce = can_reduce and (
                            s[2][0] != v[2]) and (s[2][1] != v[2])

                if can_reduce:
                    ires.append(
                        (C_MOV, [C_OPT_NO, v[1]], (op[2], v[2]), offset))
                else:
                    ires.insert(0, v)
                    ires.append(op)
            else:
                ires.append(op)
        elif op[0] == C_COMMENT:
            result.append(op)
        else:
            result += stack
            stack = []
            result += ires
            ires = []
            result.append(op)
    result += stack
    return result


def optimize_mov(text):
    " reduce mov twice (mov eax, 5; mov ebx, eax)"
    prev_mov = None
    result = []
    comments = []
    for i, op in enumerate(text):
        if len(op) > 3:
            offset = op[3]
        else:
            offset = 0

        if op[0] == C_MOV:
            if prev_mov is not None:
                v = prev_mov
                if (v[2][0] == op[2][1]) and (v[1][0] == op[1][1]) and \
                   (not (op[1][0] == v[1][1] and op[1][0] == C_OPT_ADDR)):
                    result.append((C_MOV, [op[1][0], v[
                                  1][1]], (op[2][0], v[2][1]), offset))
                    prev_mov = None
                else:
                    result.append(prev_mov)
                    prev_mov = op
            else:
                prev_mov = op
        elif op[0] == C_COMMENT:
            comments.append(op)
        else:
            if prev_mov is not None:
                result.append(prev_mov)
            prev_mov = None
            result += comments
            comments = []
            result.append(op)
    if prev_mov is not None:
        result += prev_mov
    return result


def optimize_mov_push(text):
    " reduce mov before push (mov eax, 5; push eax) "
    prev_mov = None
    result = []
    comments = []
    ires = []
    for i, op in enumerate(text):
        if len(op) > 3:
            offset = op[3]
        else:
            offset = 0

        if op[0] == C_MOV:
            if prev_mov is not None:
                ires.append(prev_mov)
            prev_mov = op
        elif op[0] == C_PUSH:
            if prev_mov is not None:
                v = prev_mov
                if (v[2][0] == op[2]) and (v[1][0] == op[1]) and (v[1][0] != C_OPT_ADDR):
                    ires.append((C_PUSH, v[1][1], v[2][1], offset))
                else:
                    ires.append(prev_mov)
                    ires.append(op)
            else:
                ires.append(op)
            prev_mov = None
        elif op[0] == C_COMMENT:
            comments.append(op)
        else:
            if prev_mov is not None:
                ires.append(prev_mov)
            prev_mov = None
            result += comments
            comments = []
            result += ires
            ires = []
            result.append(op)
    if prev_mov is not None:
        result += prev_mov
    return result


def optimize_clean_lines(text):
    result = []
    n = 0
    for i, op in enumerate(text):
        if op[0] == C_COMMENT:
            if op[2] == None or op[2] == "":
                n += 1
                if n < 2:
                    result.append(op)
            else:
                result.append(op)
        else:
            result.append(op)
            n = 0
    return result

./utils/gen_asm.py

from __future__ import print_function

from time import strftime, gmtime

from utils.const import *
from utils import ParserError


def gen_real_asm(pseudo, src_file):
    res = []
    res.append("; Source file: %s" % src_file)
    res.append("; Generated %s" % strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    res.append("")
    res.append("SECTION .data")
    res.append("")
    for command in pseudo.data:
        res += nasm_gen(command)
    res.append("")
    res.append("SECTION .text")
    res.append("")
    for command in pseudo.text:
        res += nasm_gen(command)

    return res


def operand(x, t):
    if t == C_OPT_NO:
        return x
    elif t == C_OPT_ADDR:
        return "[%s]" % x


def nasm_gen(l):
    cmd = None
    if l[0] == C_PUSH:
        cmd = ["push\tdword\t%s" % operand(l[2], l[1])]
    elif l[0] == C_POP:
        cmd = ["pop\t\tdword\t%s" % l[2]]
    elif l[0] == C_CALL:
        cmd = ["call\t%s" % l[2]]
    elif l[0] == C_INT:
        cmd = ["int\t%s" % l[2]]
    elif l[0] == C_MOV:
        cmd = ["mov\t\tdword\t%s, %s" % (operand(l[2][0], l[1][0]),
                                         operand(l[2][1], l[1][1]))]
    elif l[0] == C_ADD:
        cmd = ["add\t\t%s, %s" % (operand(l[2][0], l[1][0]),
                                  operand(l[2][1], l[1][1]))]
    elif l[0] == C_IMUL:
        cmd = ["imul\t%s" % l[2]]
    elif l[0] == C_IDIV:
        cmd = ["idiv\t%s" % l[2]]
    elif l[0] == C_SUB:
        cmd = ["sub\t\t%s, %s" % (operand(l[2][0], l[1][0]),
                                  operand(l[2][1], l[1][1]))]
    elif l[0] == C_CMP:
        cmd = ["cmp\t%s,%s" % (l[2][0], l[2][1])]
    elif l[0] == C_COMMENT:
        if l[2] is None:
            cmd = [""]
        else:
            cmd = ["; %s" % l[2]]
    elif l[0] == C_EQU:
        cmd = ["%s:\tequ\t%s" % (l[1], l[2])]
    elif l[0] == C_EQU_F:
        cmd = ["%s:\tequ\t\t$-%s" % (l[1], l[2])]
    elif l[0] == C_DB:
        cmd = ["%s:\tdb\t\t%s" % (l[1], l[2])]
    elif l[0] == C_DD:
        cmd = ["%s:\t\tdd\t%s" % (l[1], l[2])]
    elif l[0] == C_GLOBL:
        cmd = ["global\t%s" % l[2]]
    elif l[0] == C_EXTRN:
        cmd = ["extern\t%s" % l[2]]
    elif l[0] == C_NEG:
        cmd = ["neg\t%s" % l[2]]
    elif l[0] == C_LABEL:
        cmd = ["%s:" % l[2]]
    elif l[0] == C_JMP:
        if l[1] is None:
            cmd = ["jmp\t%s" % l[2]]
        else:
            cmd = ["%s\t%s" % (l[1], l[2])]
    elif l[0] == C_RET:
        cmd = ["ret"]
    else:
        raise ParserError("Can't translate %d command" % l[0])

    cmd = map(lambda x: "\t%s" % x, cmd)

    if len(l) > 3:
        return map(lambda x: "%s%s" % (l[3] * "\t", x), cmd)
    else:
        return cmd