Monday, 12 December 2022

How to access content of stack in Python disassembler?

I'm creating a tool that analyses the code used within the definition of a user-defined function, for example:

def help_fun(a, b): return a * b  # plain helper called by test1; expected to be reported under 'functions'
def test1(numbers):  # sample function to analyse; np and math are presumably imported elsewhere
    r1, r2 = np.multiply.reduce(numbers), sum(numbers)/len(numbers)  # one method call (reduce) and two function calls (sum, len)
    r = math.sin(help_fun(r1, r2))  # a function call nested inside a method call
    return math.sqrt(r)  # method call whose result is returned

I do understand how to interpret the results of dis.dis(test1):

3           0 LOAD_GLOBAL              0 (np)
            2 LOAD_ATTR                1 (multiply)
            4 LOAD_METHOD              2 (reduce)
            6 LOAD_FAST                0 (numbers)
            8 CALL_METHOD              1
           10 LOAD_GLOBAL              3 (sum)
           ...
5          46 LOAD_GLOBAL              5 (math)
           48 LOAD_METHOD              8 (sqrt)
           50 LOAD_FAST                3 (r)
           52 CALL_METHOD              1
           54 RETURN_VALUE

My expected output is:

{0: {'functions': ['sum', 'len', 'help_fun'], 
     'methods': ['np.multiply.reduce', 'math.sin', 'math.sqrt']}} #0 indicates global frame

In order to collect function and method names, I implemented a wrapper for the contents of the disassembler's stack and use a specific way to extract these names from the stack archive.

import types
import dis
def get_frames(code):
    '''Yield ``code`` and then every code object nested inside it, depth-first.

    Nested code objects (e.g. inner functions or lambdas) are stored among
    the constants of the enclosing code object. Every such constant is
    visited, so sibling nested code objects are all reported, not only the
    first one found.

    code: a code object, e.g. ``<function>.__code__``.
    Yields: ``code`` itself first, then each nested code object.

    >>> [type(x).__name__ for x in get_frames(test1.__code__)]
    ['code']
    '''
    yield code
    for c in code.co_consts:
        if isinstance(c, types.CodeType):
            # No early exit here: stopping after the first nested code object
            # would silently skip its siblings.
            yield from get_frames(c)

def get_calls(instructions, stack):
    '''Collect the callables invoked by CALL_FUNCTION and CALL_METHOD instructions.

    instructions: iterable of objects exposing ``opname`` and ``arg``.
    stack: sequence of stack snapshots, where ``stack[k]`` is the stack
        contents after the k-th instruction.
    Returns: ``{'functions': [...], 'methods': [...]}`` in order of appearance.
    '''
    calls = {'functions': [], 'methods': []}
    bucket_for = {'CALL_FUNCTION': 'functions', 'CALL_METHOD': 'methods'}
    for position, instruction in enumerate(instructions):
        bucket = bucket_for.get(instruction.opname)
        if bucket is None:
            continue
        # Look at the snapshot taken just before the call: the callable sits
        # right below its ``arg`` arguments.
        before_call = stack[position - 1]
        calls[bucket].append(before_call[-1 - instruction.arg])
    return calls

def get_stack(instructions):
    '''Symbolically simulate the evaluation stack, one snapshot per instruction.

    Each stack slot is a string holding a source-like expression built from
    the ``argrepr`` of the instructions that produced it. Only the opnames
    handled below modify the stack; for any other instruction the snapshot
    is simply a copy of the previous state.

    instructions: iterable of objects exposing ``opname``, ``arg`` and
        ``argrepr`` (for example a ``dis.Bytecode`` instance).
    Yields: a copy of the stack (list of str) after each instruction.
    Raises: IndexError if a handled instruction needs an operand that the
        simulated stack does not hold.
    '''
    stack = []
    for n in instructions:
        if n.opname in ('LOAD_FAST', 'LOAD_GLOBAL', 'LOAD_CONST'):
            stack.append(n.argrepr)
        elif n.opname in ('LOAD_METHOD', 'LOAD_ATTR'):
            # Attribute access replaces the object on top of the stack.
            stack[-1] = f'{stack[-1]}.{n.argrepr}'
        elif n.opname in ('CALL_FUNCTION', 'CALL_METHOD'):
            # Slice from an explicit index rather than ``-n.arg``: for a
            # zero-argument call ``stack[-0:]`` would be the whole stack.
            first_arg = len(stack) - n.arg
            args = stack[first_arg:]
            del stack[first_arg:]
            stack[-1] = f'{stack[-1]}({", ".join(args)})'
        elif n.opname == 'BINARY_TRUE_DIVIDE':
            stack[-2:] = [' / '.join(stack[-2:])]
        elif n.opname == 'STORE_FAST':
            del stack[-1]
        elif n.opname == 'ROT_TWO':
            stack[-1], stack[-2] = stack[-2], stack[-1]
        elif n.opname == 'GET_ITER':
            stack[-1] = f'iter({stack[-1]})'
        # Copy so later mutations do not alter snapshots already yielded.
        yield stack.copy()

# Gather every code object reachable from test1; index 0 is test1's own frame.
code = [*get_frames(test1.__code__)]
out = {}
for i, c in enumerate(code):
    # The same Bytecode object is iterated twice: once to build the stack
    # snapshots, once to pick out the call instructions.
    instructions = dis.Bytecode(c)
    stack = [*get_stack(instructions)]
    out[i] = get_calls(instructions, stack)
out
>>> {0: {'functions': ['sum', 'len', 'help_fun'], 'methods': ['np.multiply.reduce', 'math.sin', 'math.sqrt']}}

In my approach, the names of functions and methods are extracted from the stack column of this table:

|   line | opname             |   arg | argrepr   | stack                                                    |
|--------|--------------------|-------|-----------|----------------------------------------------------------|
|      3 | LOAD_GLOBAL        |     0 | np        | np                                                       |
|        | LOAD_ATTR          |     1 | multiply  | np.multiply                                              |
|        | LOAD_METHOD        |     2 | reduce    | np.multiply.reduce                                       |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce, numbers                              |
|        | CALL_METHOD        |     1 |           | np.multiply.reduce(numbers)                              |
|        | LOAD_GLOBAL        |     3 | sum       | np.multiply.reduce(numbers), sum                         |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce(numbers), sum, numbers                |
|        | CALL_FUNCTION      |     1 |           | np.multiply.reduce(numbers), sum(numbers)                |
|        | LOAD_GLOBAL        |     4 | len       | np.multiply.reduce(numbers), sum(numbers), len           |
|        | LOAD_FAST          |     0 | numbers   | np.multiply.reduce(numbers), sum(numbers), len, numbers  |
|        | CALL_FUNCTION      |     1 |           | np.multiply.reduce(numbers), sum(numbers), len(numbers)  |
|        | BINARY_TRUE_DIVIDE |       |           | np.multiply.reduce(numbers), sum(numbers) / len(numbers) |
|        | ROT_TWO            |       |           | sum(numbers) / len(numbers), np.multiply.reduce(numbers) |
|        | STORE_FAST         |     1 | r1        | sum(numbers) / len(numbers)                              |
|        | STORE_FAST         |     2 | r2        |                                                          |
|      4 | LOAD_GLOBAL        |     5 | math      | math                                                     |
|        | LOAD_METHOD        |     6 | sin       | math.sin                                                 |
|        | LOAD_GLOBAL        |     7 | help_fun  | math.sin, help_fun                                       |
|        | LOAD_FAST          |     1 | r1        | math.sin, help_fun, r1                                   |
|        | LOAD_FAST          |     2 | r2        | math.sin, help_fun, r1, r2                               |
|        | CALL_FUNCTION      |     2 |           | math.sin, help_fun(r1, r2)                               |
|        | CALL_METHOD        |     1 |           | math.sin(help_fun(r1, r2))                               |
|        | STORE_FAST         |     3 | r         |                                                          |
|      5 | LOAD_GLOBAL        |     5 | math      | math                                                     |
|        | LOAD_METHOD        |     8 | sqrt      | math.sqrt                                                |
|        | LOAD_FAST          |     3 | r         | math.sqrt, r                                             |
|        | CALL_METHOD        |     1 |           | math.sqrt(r)                                             |
|        | RETURN_VALUE       |       |           | math.sqrt(r)                                             |

However, things get more complicated if other kinds of opnames are included in my instructions. For instance, I'm not sure about the behaviour of the stack if any list comprehensions are used. Getting the names of methods and functions will crash in the case of:

def test2(x):  # sample input with nested list comprehensions; reported to crash the name extraction
    return [[math.sqrt(m) for m in list(n)] for n in x]

Is there an easier way to get the names of methods and functions used inside a caller? Are there any better ways to get an archive of the stack? I know my implementation of get_stack is poor at the moment; I'm looking for a different approach or better documentation of stack control.



from How to access content of stack in Python disassembler?

No comments:

Post a Comment