.pyc 文件相信大家见怪不怪,大家经常在 __pycache__ 里面见到这些文件。这些文件存储了 python 编译出来的字节码文件,还有一些元信息(例如版本号,对应文件的修改时间)。接下来将通过对源码的解析对 Python Compiled 文件将进行简要的剖析。

注:本文章中出现的代码来自2021年7月12日的 cpython 的 Tag: v3.9.5,作者对源码进行了部分注释和删减

约定和提要

在这篇文章中,我们的目标是填充好这个结构体。

type MTimePycFile struct { ... }

同时,这篇文章需要读者具备一定的 python 和 C 代码的阅读能力。

pyc 文件的生成

在 python 中,我们可以使用 py_compile 模块来输出 pyc 文件。

python -m py_compile test.py

在同目录下的 __pycache__ 里面,可以找到生成的文件:test.cpython-38.pyc

接下来,我们深入到 py_compile 的源码实现里面,看看它到底是怎么生成的。我们以带时间戳的 pyc 文件输出为主要研究对象。

# cpython/Lib/importlib/ _bootstrap_external.py
# ...

def main(args=None):
    """Compile several source files.

    The files named in 'args' (or on the command line, if 'args' is
    not specified) are compiled and the resulting bytecode is cached
    in the normal manner.  This function does not search a directory
    structure to locate source files; it only compiles files named
    explicitly.  If '-' is the only parameter in args, the list of
    files is taken from standard input.

    """
    if args is None:
        args = sys.argv[1:]
    rv = 0  # exit code
    if args == ['-']:
        ...
    else:
        for filename in args:
            try:
                compile(filename, doraise=True)  # 开始进行编译
            except PyCompileError as error:
                # return value to indicate at least one failure
                rv = 1
                sys.stderr.write("%s\n" % error.msg)
    return rv

if __name__ == "__main__":
    sys.exit(main())
# cpython/Lib/importlib/ _bootstrap_external.py
def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1,
            invalidation_mode=None, quiet=0):
    ...
    source_bytes = loader.get_data(file)  # 得到源码
    try:
        # 编译源码到 code 对象
        code = loader.source_to_code(source_bytes, dfile or file,
                                     _optimize=optimize)
    except Exception as err:
        ...
    ...
    if invalidation_mode == PycInvalidationMode.TIMESTAMP:
        source_stats = loader.path_stats(file)
        
        # 调用 _code_to_timestamp_pyc ,输出 pyc 带时间戳的pyc文件的内容
        bytecode = importlib._bootstrap_external._code_to_timestamp_pyc(
            code, source_stats['mtime'], source_stats['size'])
    else:
        ...
    mode = importlib._bootstrap_external._calc_mode(file)
    importlib._bootstrap_external._write_atomic(cfile, bytecode, mode)
    return cfile
# cpython/Lib/importlib/_bootstrap_external.py
# ...
MAGIC_NUMBER = (3425).to_bytes(2, 'little') + b'\r\n'
# ...
def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
    "Produce the data for a timestamp-based pyc."
    data = bytearray(MAGIC_NUMBER)
    data.extend(_pack_uint32(0))
    data.extend(_pack_uint32(mtime))  # 源码修改时间
    data.extend(_pack_uint32(source_size))  # 源码长度
    data.extend(marshal.dumps(code))
    return data

需要注意的是,里面的 MAGIC_NUMBER 是由版本号确定的,不同的 python 版本由不同的版本号。在这里是 3425.

_code_to_timestamp_pyc 里面,我们可以初步推断 pyc 文件结构:

type MTimePycFile struct {
    MagicNumber		 int32
    Sep				 uint32
    ModificationTime uint32
    SourceSize		 uint32
    CodeData		 CodeObject
}

marshal 探秘

接下来,我们进一步探索 CodeObject 的结构。我们在上面的代码中可以看到,CodeData 是来自 marshal.dumps(code) 。对于 marshal 模块的作用,python 文档是这么说的:

marshal — Internal Python object serialization

这个模块用来序列化和反序列化各种 内置 python对象,它使用 C 来实现,是 python 的扩展模块,同时,也是 python 核心中的重要组成。下面是 marshal 支持的类型:

// cpython/Python/marshal.c

#define TYPE_NULL               '0'
#define TYPE_NONE               'N'
#define TYPE_FALSE              'F'
#define TYPE_TRUE               'T'
#define TYPE_STOPITER           'S'
#define TYPE_ELLIPSIS           '.'
#define TYPE_INT                'i'
/* TYPE_INT64 is not generated anymore.
   Supported for backward compatibility only. */
#define TYPE_INT64              'I'
#define TYPE_FLOAT              'f'
#define TYPE_BINARY_FLOAT       'g'
#define TYPE_COMPLEX            'x'
#define TYPE_BINARY_COMPLEX     'y'
#define TYPE_LONG               'l'
#define TYPE_STRING             's'
#define TYPE_INTERNED           't'
#define TYPE_REF                'r'
#define TYPE_TUPLE              '('
#define TYPE_LIST               '['
#define TYPE_DICT               '{'
#define TYPE_CODE               'c'
#define TYPE_UNICODE            'u'
#define TYPE_UNKNOWN            '?'
#define TYPE_SET                '<'
#define TYPE_FROZENSET          '>'
#define TYPE_ASCII              'a'
#define TYPE_ASCII_INTERNED     'A'
#define TYPE_SMALL_TUPLE        ')'
#define TYPE_SHORT_ASCII        'z'
#define TYPE_SHORT_ASCII_INTERNED 'Z'

我们来看看 marshal.dumps 的实现:

static PyObject *
marshal_dumps_impl(PyObject *module, PyObject *value, int version)
/*[clinic end generated code: output=9c200f98d7256cad input=a2139ea8608e9b27]*/
{
    return PyMarshal_WriteObjectToString(value, version);
}
PyObject *
PyMarshal_WriteObjectToString(PyObject *x, int version)
{
    WFILE wf;

    memset(&wf, 0, sizeof(wf));
    wf.str = PyBytes_FromStringAndSize((char *)NULL, 50);
    if (wf.str == NULL)
        return NULL;
    wf.ptr = wf.buf = PyBytes_AS_STRING(wf.str);
    wf.end = wf.ptr + PyBytes_GET_SIZE(wf.str);
    wf.error = WFERR_OK;
    wf.version = version;
    if (w_init_refs(&wf, version)) {
        Py_DECREF(wf.str);
        return NULL;
    }
    w_object(x, &wf);  // 序列化对象
    w_clear_refs(&wf);
    // ...
    return wf.str;
}

接下来我们就要深入到 w_object 中了,这个函数是由一个个 if, else if 构成的。

static void
w_object(PyObject *v, WFILE *p)
{
    char flag = '\0';

    p->depth++;

    if (p->depth > MAX_MARSHAL_STACK_DEPTH) {
        p->error = WFERR_NESTEDTOODEEP;
    }
    else if (v == NULL) {
        w_byte(TYPE_NULL, p);
    }
    else if (v == Py_None) {
        w_byte(TYPE_NONE, p);
    }
    else if (v == PyExc_StopIteration) {
        w_byte(TYPE_STOPITER, p);
    }
    else if (v == Py_Ellipsis) {
        w_byte(TYPE_ELLIPSIS, p);
    }
    else if (v == Py_False) {
        w_byte(TYPE_FALSE, p);
    }
    else if (v == Py_True) {
        w_byte(TYPE_TRUE, p);
    }
    else if (!w_ref(v, &flag, p))
        w_complex_object(v, flag, p);  // 如果要序列化 code object 最终会来到这里

    p->depth--;
}

static void
w_complex_object(PyObject *v, char flag, WFILE *p)
{
    Py_ssize_t i, n;

    if (PyLong_CheckExact(v)) {
        int overflow;
        long x = PyLong_AsLongAndOverflow(v, &overflow);
        if (overflow) {
            w_PyLong((PyLongObject *)v, flag, p);
        }
        else {
#if SIZEOF_LONG > 4
            long y = Py_ARITHMETIC_RIGHT_SHIFT(long, x, 31);
            if (y && y != -1) {
                /* Too large for TYPE_INT */
                w_PyLong((PyLongObject*)v, flag, p);
            }
            else
#endif
            {
                W_TYPE(TYPE_INT, p);
                w_long(x, p);
            }
        }
    }
    else if (PyFloat_CheckExact(v)) {
        if (p->version > 1) {
            W_TYPE(TYPE_BINARY_FLOAT, p);
            w_float_bin(PyFloat_AS_DOUBLE(v), p);
        }
        else {
            W_TYPE(TYPE_FLOAT, p);
            w_float_str(PyFloat_AS_DOUBLE(v), p);
        }
    }
    else if (PyComplex_CheckExact(v)) {
        if (p->version > 1) {
            W_TYPE(TYPE_BINARY_COMPLEX, p);
            w_float_bin(PyComplex_RealAsDouble(v), p);
            w_float_bin(PyComplex_ImagAsDouble(v), p);
        }
        else {
            W_TYPE(TYPE_COMPLEX, p);
            w_float_str(PyComplex_RealAsDouble(v), p);
            w_float_str(PyComplex_ImagAsDouble(v), p);
        }
    }
    else if (PyBytes_CheckExact(v)) {
        W_TYPE(TYPE_STRING, p);
        w_pstring(PyBytes_AS_STRING(v), PyBytes_GET_SIZE(v), p);
    }
    else if (PyUnicode_CheckExact(v)) {
        if (p->version >= 4 && PyUnicode_IS_ASCII(v)) {
            int is_short = PyUnicode_GET_LENGTH(v) < 256;
            if (is_short) {
                if (PyUnicode_CHECK_INTERNED(v))
                    W_TYPE(TYPE_SHORT_ASCII_INTERNED, p);
                else
                    W_TYPE(TYPE_SHORT_ASCII, p);
                w_short_pstring(PyUnicode_1BYTE_DATA(v),
                                PyUnicode_GET_LENGTH(v), p);
            }
            else {
                if (PyUnicode_CHECK_INTERNED(v))
                    W_TYPE(TYPE_ASCII_INTERNED, p);
                else
                    W_TYPE(TYPE_ASCII, p);
                w_pstring(PyUnicode_1BYTE_DATA(v),
                          PyUnicode_GET_LENGTH(v), p);
            }
        }
        else {
            PyObject *utf8;
            utf8 = PyUnicode_AsEncodedString(v, "utf8", "surrogatepass");
            if (utf8 == NULL) {
                p->depth--;
                p->error = WFERR_UNMARSHALLABLE;
                return;
            }
            if (p->version >= 3 &&  PyUnicode_CHECK_INTERNED(v))
                W_TYPE(TYPE_INTERNED, p);
            else
                W_TYPE(TYPE_UNICODE, p);
            w_pstring(PyBytes_AS_STRING(utf8), PyBytes_GET_SIZE(utf8), p);
            Py_DECREF(utf8);
        }
    }
    else if (PyTuple_CheckExact(v)) {
        n = PyTuple_GET_SIZE(v);
        if (p->version >= 4 && n < 256) {
            W_TYPE(TYPE_SMALL_TUPLE, p);
            w_byte((unsigned char)n, p);
        }
        else {
            W_TYPE(TYPE_TUPLE, p);
            W_SIZE(n, p);
        }
        for (i = 0; i < n; i++) {
            w_object(PyTuple_GET_ITEM(v, i), p);
        }
    }
    else if (PyList_CheckExact(v)) {
        W_TYPE(TYPE_LIST, p);
        n = PyList_GET_SIZE(v);
        W_SIZE(n, p);
        for (i = 0; i < n; i++) {
            w_object(PyList_GET_ITEM(v, i), p);
        }
    }
    else if (PyDict_CheckExact(v)) {
        Py_ssize_t pos;
        PyObject *key, *value;
        W_TYPE(TYPE_DICT, p);
        /* This one is NULL object terminated! */
        pos = 0;
        while (PyDict_Next(v, &pos, &key, &value)) {
            w_object(key, p);
            w_object(value, p);
        }
        w_object((PyObject *)NULL, p);
    }
    else if (PyAnySet_CheckExact(v)) {
        PyObject *value;
        Py_ssize_t pos = 0;
        Py_hash_t hash;

        if (PyFrozenSet_CheckExact(v))
            W_TYPE(TYPE_FROZENSET, p);
        else
            W_TYPE(TYPE_SET, p);
        n = PySet_GET_SIZE(v);
        W_SIZE(n, p);
        while (_PySet_NextEntry(v, &pos, &value, &hash)) {
            w_object(value, p);
        }
    }
    else if (PyCode_Check(v)) {
        PyCodeObject *co = (PyCodeObject *)v;
        W_TYPE(TYPE_CODE, p);
        w_long(co->co_argcount, p);
        w_long(co->co_posonlyargcount, p);
        w_long(co->co_kwonlyargcount, p);
        w_long(co->co_nlocals, p);
        w_long(co->co_stacksize, p);
        w_long(co->co_flags, p);
        w_object(co->co_code, p);
        w_object(co->co_consts, p);
        w_object(co->co_names, p);
        w_object(co->co_varnames, p);
        w_object(co->co_freevars, p);
        w_object(co->co_cellvars, p);
        w_object(co->co_filename, p);
        w_object(co->co_name, p);
        w_long(co->co_firstlineno, p);
        w_object(co->co_lnotab, p);
    }
    else if (PyObject_CheckBuffer(v)) {
        /* Write unknown bytes-like objects as a bytes object */
        Py_buffer view;
        if (PyObject_GetBuffer(v, &view, PyBUF_SIMPLE) != 0) {
            w_byte(TYPE_UNKNOWN, p);
            p->depth--;
            p->error = WFERR_UNMARSHALLABLE;
            return;
        }
        W_TYPE(TYPE_STRING, p);
        w_pstring(view.buf, view.len, p);
        PyBuffer_Release(&view);
    }
    else {
        W_TYPE(TYPE_UNKNOWN, p);
        p->error = WFERR_UNMARSHALLABLE;
    }
}

上面展示了 w_objectw_complex_object 的全貌。接下来我们来看看 Code Object 是如何进行序列化的。

static void
w_complex_object(PyObject *v, char flag, WFILE *p)
{
    // ...
    else if (PyCode_Check(v)) {
        PyCodeObject *co = (PyCodeObject *)v;
        W_TYPE(TYPE_CODE, p);
        w_long(co->co_argcount, p);
        w_long(co->co_posonlyargcount, p);
        w_long(co->co_kwonlyargcount, p);
        w_long(co->co_nlocals, p);
        w_long(co->co_stacksize, p);
        w_long(co->co_flags, p);
        w_object(co->co_code, p);
        w_object(co->co_consts, p);
        w_object(co->co_names, p);
        w_object(co->co_varnames, p);
        w_object(co->co_freevars, p);
        w_object(co->co_cellvars, p);
        w_object(co->co_filename, p);
        w_object(co->co_name, p);
        w_long(co->co_firstlineno, p);
        w_object(co->co_lnotab, p);
    }
    // ...
}

一个 CodeObject 就是按照这个顺序被一步步序列化的。至此,我们可以完善我们的 struct 。

type MTimePycFile struct {
    MagicNumber		 int32
    Sep				 uint32
    ModificationTime uint32
    SourceSize		 uint32
    CodeData		 CodeObject
}

type CodeObject struct {
    TYPE_CODE			byte
    co_argcount 		int64
    co_posonlyargcount	int64
    co_kwonlyargcount	int64
    co_nlocals			int64
    co_stacksize		int64
    co_flags			int64
    co_code				PyObject
    co_consts			PyObject			
    co_names			PyObject
    co_varnames			PyObject
    co_freevars			PyObject
    co_cellvars			PyObject
    co_filename			PyObject
    co_name				PyObject
    co_firstlineno		int64
    co_lnotab			PyObject
}

好了,这就是带时间戳的 pyc 文件结构了。PyObject 的结构由于篇幅有限,这里就不多阐述了,有兴趣的朋友可以继续深挖 marshal 。