.pyc
文件相信大家见怪不怪,大家经常在 __pycache__
里面见到这些文件。这些文件存储了 Python 编译出来的字节码,还有一些元信息(例如版本号、对应源文件的修改时间)。接下来将通过对源码的解析,对 Python Compiled 文件进行简要的剖析。
注:本文章中出现的代码来自2021年7月12日的 cpython 的 Tag: v3.9.5,作者对源码进行了部分注释和删减
约定和提要
在这篇文章中,我们的目标是填充好这个结构体。
type MTimePycFile struct { ... }
同时,这篇文章需要读者具备一定的 python 和 C 代码的阅读能力。
pyc 文件的生成
在 python 中,我们可以使用 py_compile
模块来输出 pyc 文件。
python -m py_compile test.py
在同目录下的 __pycache__
里面,可以找到生成的文件:test.cpython-39.pyc(文件名中的数字对应所用解释器的版本,这里以 3.9 为例)
。
接下来,我们深入到 py_compile 的源码实现里面,看看它到底是怎么生成的。我们以带时间戳的 pyc 文件输出为主要研究对象。
# cpython/Lib/py_compile.py
# ...
def main(args=None):
    """Compile several source files.

    The files named in 'args' (or on the command line, if 'args' is
    not specified) are compiled and the resulting bytecode is cached
    in the normal manner. This function does not search a directory
    structure to locate source files; it only compiles files named
    explicitly. If '-' is the only parameter in args, the list of
    files is taken from standard input.

    Returns 0 when no file raised PyCompileError, otherwise 1.
    """
    if args is None:
        args = sys.argv[1:]
    rv = 0  # exit code
    if args == ['-']:
        ...  # stdin-driven branch is elided in this excerpt
    else:
        for filename in args:
            try:
                # Start compiling this file; with doraise=True a failure is
                # expected to surface as PyCompileError, handled below.
                compile(filename, doraise=True)
            except PyCompileError as error:
                # return value to indicate at least one failure
                rv = 1
                sys.stderr.write("%s\n" % error.msg)
    return rv


if __name__ == "__main__":
    sys.exit(main())
# cpython/Lib/py_compile.py
def compile(file, cfile=None, dfile=None, doraise=False, optimize=-1,
            invalidation_mode=None, quiet=0):
    """Byte-compile one source file and write the result to ``cfile``.

    This is an abridged excerpt: every ``...`` marks code that was cut
    out, including the setup of ``loader`` and the handling of the other
    invalidation modes.  Only the timestamp-based path is shown in full.

    Returns ``cfile`` after the bytecode has been written.
    """
    ...
    source_bytes = loader.get_data(file)  # read the source
    try:
        # Compile the source into a code object.
        code = loader.source_to_code(source_bytes, dfile or file,
                                     _optimize=optimize)
    except Exception as err:
        ...
    ...
    if invalidation_mode == PycInvalidationMode.TIMESTAMP:
        source_stats = loader.path_stats(file)
        # Build the content of a timestamp-based pyc from the code object
        # plus the source's mtime and size.
        bytecode = importlib._bootstrap_external._code_to_timestamp_pyc(
            code, source_stats['mtime'], source_stats['size'])
    else:
        ...
    mode = importlib._bootstrap_external._calc_mode(file)
    importlib._bootstrap_external._write_atomic(cfile, bytecode, mode)
    return cfile
# cpython/Lib/importlib/_bootstrap_external.py
# ...
# 3425 encoded as 2 little-endian bytes, followed by b'\r\n': 4 bytes in total.
MAGIC_NUMBER = (3425).to_bytes(2, 'little') + b'\r\n'
# ...


def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
    """Produce the data for a timestamp-based pyc.

    The returned bytearray is built in this order:

    1. MAGIC_NUMBER
    2. _pack_uint32(0)
    3. _pack_uint32(mtime)        -- source modification time
    4. _pack_uint32(source_size)  -- source length
    5. marshal.dumps(code)        -- the serialized code object
    """
    data = bytearray(MAGIC_NUMBER)
    data.extend(_pack_uint32(0))
    data.extend(_pack_uint32(mtime))  # source modification time
    data.extend(_pack_uint32(source_size))  # source length
    data.extend(marshal.dumps(code))
    return data
需要注意的是,里面的 MAGIC_NUMBER
是由 Python 版本确定的,不同的 Python 版本有不同的魔数,在这里是 3425。
在 _code_to_timestamp_pyc
里面,我们可以初步推断 pyc 文件结构:
// MTimePycFile sketches the byte layout that _code_to_timestamp_pyc produces
// for a timestamp-based pyc, listed in the order the pieces are appended.
type MTimePycFile struct {
	MagicNumber int32 // MAGIC_NUMBER: 2 little-endian bytes of the magic value followed by "\r\n"
	Sep uint32 // written as _pack_uint32(0); NOTE(review): CPython appears to use this word as the PEP 552 flags field — confirm
	ModificationTime uint32 // _pack_uint32(mtime): source modification time
	SourceSize uint32 // _pack_uint32(source_size): source length
	CodeData CodeObject // marshal.dumps(code)
}
marshal 探秘
接下来,我们进一步探索 CodeObject 的结构。我们在上面的代码中可以看到,CodeData
是来自 marshal.dumps(code)
。对于 marshal 模块的作用,python 文档是这么说的:
marshal — Internal Python object serialization
这个模块用来序列化和反序列化各种内置 Python 对象。它使用 C 实现,是 Python 的扩展模块,同时也是 Python 核心的重要组成部分。下面是 marshal 支持的类型:
// cpython/Python/marshal.c
/* One-character type tags of the marshal format.  The writer emits one of
   these (via w_byte() or W_TYPE()) before the payload of each value. */
#define TYPE_NULL '0'
#define TYPE_NONE 'N'
#define TYPE_FALSE 'F'
#define TYPE_TRUE 'T'
#define TYPE_STOPITER 'S'
#define TYPE_ELLIPSIS '.'
#define TYPE_INT 'i'
/* TYPE_INT64 is not generated anymore.
Supported for backward compatibility only. */
#define TYPE_INT64 'I'
#define TYPE_FLOAT 'f'
#define TYPE_BINARY_FLOAT 'g'
#define TYPE_COMPLEX 'x'
#define TYPE_BINARY_COMPLEX 'y'
#define TYPE_LONG 'l'
#define TYPE_STRING 's'
#define TYPE_INTERNED 't'
#define TYPE_REF 'r'
#define TYPE_TUPLE '('
#define TYPE_LIST '['
#define TYPE_DICT '{'
#define TYPE_CODE 'c'
#define TYPE_UNICODE 'u'
#define TYPE_UNKNOWN '?'
#define TYPE_SET '<'
#define TYPE_FROZENSET '>'
#define TYPE_ASCII 'a'
#define TYPE_ASCII_INTERNED 'A'
#define TYPE_SMALL_TUPLE ')'
#define TYPE_SHORT_ASCII 'z'
#define TYPE_SHORT_ASCII_INTERNED 'Z'
我们来看看 marshal.dumps
的实现:
/* Implementation behind marshal.dumps(): forwards `value` and `version`
   unchanged to PyMarshal_WriteObjectToString() and returns its result.
   `module` is not used in this body. */
static PyObject *
marshal_dumps_impl(PyObject *module, PyObject *value, int version)
/*[clinic end generated code: output=9c200f98d7256cad input=a2139ea8608e9b27]*/
{
    return PyMarshal_WriteObjectToString(value, version);
}
/* Serialize `x` into a bytes object using marshal format `version`.

   Returns wf.str, or NULL when the initial buffer allocation or
   w_init_refs() fails.  Part of the function is elided in this excerpt
   (see the "// ..." marker), so any post-processing of the buffer or
   error state is not visible here. */
PyObject *
PyMarshal_WriteObjectToString(PyObject *x, int version)
{
    WFILE wf;
    /* Start from an all-zero writer state. */
    memset(&wf, 0, sizeof(wf));
    /* Output buffer with an initial size of 50 bytes. */
    wf.str = PyBytes_FromStringAndSize((char *)NULL, 50);
    if (wf.str == NULL)
        return NULL;
    /* ptr is the write cursor, buf the start and end the limit of the buffer. */
    wf.ptr = wf.buf = PyBytes_AS_STRING(wf.str);
    wf.end = wf.ptr + PyBytes_GET_SIZE(wf.str);
    wf.error = WFERR_OK;
    wf.version = version;
    if (w_init_refs(&wf, version)) {
        /* Reference-table setup failed: drop the buffer and bail out. */
        Py_DECREF(wf.str);
        return NULL;
    }
    w_object(x, &wf); // serialize the object
    w_clear_refs(&wf);
    // ...
    return wf.str;
}
接下来我们就要深入到 w_object
中了,这个函数是由一个个 if, else if 构成的。
/* Write one object `v` to the marshal writer `p`.

   NULL and the singletons None, StopIteration, Ellipsis, False and True
   are each written as a single type byte.  Everything else goes through
   w_ref() first and, when w_ref() returns zero, on to
   w_complex_object().  p->depth tracks the nesting level; exceeding
   MAX_MARSHAL_STACK_DEPTH records WFERR_NESTEDTOODEEP in p->error instead
   of writing anything. */
static void
w_object(PyObject *v, WFILE *p)
{
    char flag = '\0';
    p->depth++;
    if (p->depth > MAX_MARSHAL_STACK_DEPTH) {
        p->error = WFERR_NESTEDTOODEEP;
    }
    else if (v == NULL) {
        w_byte(TYPE_NULL, p);
    }
    else if (v == Py_None) {
        w_byte(TYPE_NONE, p);
    }
    else if (v == PyExc_StopIteration) {
        w_byte(TYPE_STOPITER, p);
    }
    else if (v == Py_Ellipsis) {
        w_byte(TYPE_ELLIPSIS, p);
    }
    else if (v == Py_False) {
        w_byte(TYPE_FALSE, p);
    }
    else if (v == Py_True) {
        w_byte(TYPE_TRUE, p);
    }
    /* w_ref() may set `flag`; presumably a non-zero return means the object
       was already handled as a reference (its body is not shown here). */
    else if (!w_ref(v, &flag, p))
        w_complex_object(v, flag, p); // serializing a code object eventually ends up here
    p->depth--;
}
/* Write a non-singleton object `v` to `p`, dispatching on its exact type.

   Each branch emits a type tag through W_TYPE(tag, p) and then the
   payload.  `flag` comes from w_ref() in w_object(); it is passed on to
   w_PyLong() and is presumably also picked up by the W_TYPE macro (not
   shown here).  Containers recurse through w_object() for their
   elements.  Objects that match no branch are tagged TYPE_UNKNOWN and
   p->error is set to WFERR_UNMARSHALLABLE. */
static void
w_complex_object(PyObject *v, char flag, WFILE *p)
{
    Py_ssize_t i, n;
    /* int: TYPE_INT when the value survives the range check below,
       otherwise the arbitrary-precision form written by w_PyLong(). */
    if (PyLong_CheckExact(v)) {
        int overflow;
        long x = PyLong_AsLongAndOverflow(v, &overflow);
        if (overflow) {
            w_PyLong((PyLongObject *)v, flag, p);
        }
        else {
#if SIZEOF_LONG > 4
            /* y is 0 or -1 exactly when x fits in a signed 32-bit value. */
            long y = Py_ARITHMETIC_RIGHT_SHIFT(long, x, 31);
            if (y && y != -1) {
                /* Too large for TYPE_INT */
                w_PyLong((PyLongObject*)v, flag, p);
            }
            else
#endif
            {
                W_TYPE(TYPE_INT, p);
                w_long(x, p);
            }
        }
    }
    /* float: binary form for format versions above 1, text form otherwise. */
    else if (PyFloat_CheckExact(v)) {
        if (p->version > 1) {
            W_TYPE(TYPE_BINARY_FLOAT, p);
            w_float_bin(PyFloat_AS_DOUBLE(v), p);
        }
        else {
            W_TYPE(TYPE_FLOAT, p);
            w_float_str(PyFloat_AS_DOUBLE(v), p);
        }
    }
    /* complex: real part then imaginary part, same version split as float. */
    else if (PyComplex_CheckExact(v)) {
        if (p->version > 1) {
            W_TYPE(TYPE_BINARY_COMPLEX, p);
            w_float_bin(PyComplex_RealAsDouble(v), p);
            w_float_bin(PyComplex_ImagAsDouble(v), p);
        }
        else {
            W_TYPE(TYPE_COMPLEX, p);
            w_float_str(PyComplex_RealAsDouble(v), p);
            w_float_str(PyComplex_ImagAsDouble(v), p);
        }
    }
    /* bytes */
    else if (PyBytes_CheckExact(v)) {
        W_TYPE(TYPE_STRING, p);
        w_pstring(PyBytes_AS_STRING(v), PyBytes_GET_SIZE(v), p);
    }
    /* str: from version 4 on, ASCII-only strings get dedicated tags
       (short variants below 256 characters); everything else is written
       as UTF-8 with the "surrogatepass" error handler. */
    else if (PyUnicode_CheckExact(v)) {
        if (p->version >= 4 && PyUnicode_IS_ASCII(v)) {
            int is_short = PyUnicode_GET_LENGTH(v) < 256;
            if (is_short) {
                if (PyUnicode_CHECK_INTERNED(v))
                    W_TYPE(TYPE_SHORT_ASCII_INTERNED, p);
                else
                    W_TYPE(TYPE_SHORT_ASCII, p);
                w_short_pstring(PyUnicode_1BYTE_DATA(v),
                                PyUnicode_GET_LENGTH(v), p);
            }
            else {
                if (PyUnicode_CHECK_INTERNED(v))
                    W_TYPE(TYPE_ASCII_INTERNED, p);
                else
                    W_TYPE(TYPE_ASCII, p);
                w_pstring(PyUnicode_1BYTE_DATA(v),
                          PyUnicode_GET_LENGTH(v), p);
            }
        }
        else {
            PyObject *utf8;
            utf8 = PyUnicode_AsEncodedString(v, "utf8", "surrogatepass");
            if (utf8 == NULL) {
                /* NOTE(review): w_object() also decrements depth once this
                   call returns — confirm the extra decrement is intended. */
                p->depth--;
                p->error = WFERR_UNMARSHALLABLE;
                return;
            }
            if (p->version >= 3 && PyUnicode_CHECK_INTERNED(v))
                W_TYPE(TYPE_INTERNED, p);
            else
                W_TYPE(TYPE_UNICODE, p);
            w_pstring(PyBytes_AS_STRING(utf8), PyBytes_GET_SIZE(utf8), p);
            Py_DECREF(utf8);
        }
    }
    /* tuple: from version 4 on, tuples shorter than 256 items use a
       one-byte length; otherwise the length goes through W_SIZE. */
    else if (PyTuple_CheckExact(v)) {
        n = PyTuple_GET_SIZE(v);
        if (p->version >= 4 && n < 256) {
            W_TYPE(TYPE_SMALL_TUPLE, p);
            w_byte((unsigned char)n, p);
        }
        else {
            W_TYPE(TYPE_TUPLE, p);
            W_SIZE(n, p);
        }
        for (i = 0; i < n; i++) {
            w_object(PyTuple_GET_ITEM(v, i), p);
        }
    }
    /* list: length, then each item. */
    else if (PyList_CheckExact(v)) {
        W_TYPE(TYPE_LIST, p);
        n = PyList_GET_SIZE(v);
        W_SIZE(n, p);
        for (i = 0; i < n; i++) {
            w_object(PyList_GET_ITEM(v, i), p);
        }
    }
    /* dict: key/value pairs with no length prefix; a NULL object ends it. */
    else if (PyDict_CheckExact(v)) {
        Py_ssize_t pos;
        PyObject *key, *value;
        W_TYPE(TYPE_DICT, p);
        /* This one is NULL object terminated! */
        pos = 0;
        while (PyDict_Next(v, &pos, &key, &value)) {
            w_object(key, p);
            w_object(value, p);
        }
        w_object((PyObject *)NULL, p);
    }
    /* set / frozenset: length, then each entry in iteration order. */
    else if (PyAnySet_CheckExact(v)) {
        PyObject *value;
        Py_ssize_t pos = 0;
        Py_hash_t hash;
        if (PyFrozenSet_CheckExact(v))
            W_TYPE(TYPE_FROZENSET, p);
        else
            W_TYPE(TYPE_SET, p);
        n = PySet_GET_SIZE(v);
        W_SIZE(n, p);
        while (_PySet_NextEntry(v, &pos, &value, &hash)) {
            w_object(value, p);
        }
    }
    /* code object: six integer fields via w_long(), then eight objects via
       w_object(), then co_firstlineno and finally co_lnotab. */
    else if (PyCode_Check(v)) {
        PyCodeObject *co = (PyCodeObject *)v;
        W_TYPE(TYPE_CODE, p);
        w_long(co->co_argcount, p);
        w_long(co->co_posonlyargcount, p);
        w_long(co->co_kwonlyargcount, p);
        w_long(co->co_nlocals, p);
        w_long(co->co_stacksize, p);
        w_long(co->co_flags, p);
        w_object(co->co_code, p);
        w_object(co->co_consts, p);
        w_object(co->co_names, p);
        w_object(co->co_varnames, p);
        w_object(co->co_freevars, p);
        w_object(co->co_cellvars, p);
        w_object(co->co_filename, p);
        w_object(co->co_name, p);
        w_long(co->co_firstlineno, p);
        w_object(co->co_lnotab, p);
    }
    else if (PyObject_CheckBuffer(v)) {
        /* Write unknown bytes-like objects as a bytes object */
        Py_buffer view;
        if (PyObject_GetBuffer(v, &view, PyBUF_SIMPLE) != 0) {
            w_byte(TYPE_UNKNOWN, p);
            /* NOTE(review): same extra depth decrement as in the str
               error path above — confirm against w_object(). */
            p->depth--;
            p->error = WFERR_UNMARSHALLABLE;
            return;
        }
        W_TYPE(TYPE_STRING, p);
        w_pstring(view.buf, view.len, p);
        PyBuffer_Release(&view);
    }
    /* Anything else cannot be marshalled. */
    else {
        W_TYPE(TYPE_UNKNOWN, p);
        p->error = WFERR_UNMARSHALLABLE;
    }
}
上面展示了 w_object
和 w_complex_object
的全貌。接下来我们来看看 Code Object 是如何进行序列化的。
/* Excerpt of w_complex_object() reduced to its code-object branch; the
   "// ..." markers stand for the branches left out, so this fragment is
   not compilable on its own. */
static void
w_complex_object(PyObject *v, char flag, WFILE *p)
{
    // ...
    else if (PyCode_Check(v)) {
        PyCodeObject *co = (PyCodeObject *)v;
        /* Type tag first. */
        W_TYPE(TYPE_CODE, p);
        /* Six integer fields, written with w_long(). */
        w_long(co->co_argcount, p);
        w_long(co->co_posonlyargcount, p);
        w_long(co->co_kwonlyargcount, p);
        w_long(co->co_nlocals, p);
        w_long(co->co_stacksize, p);
        w_long(co->co_flags, p);
        /* Eight nested objects, each serialized recursively by w_object(). */
        w_object(co->co_code, p);
        w_object(co->co_consts, p);
        w_object(co->co_names, p);
        w_object(co->co_varnames, p);
        w_object(co->co_freevars, p);
        w_object(co->co_cellvars, p);
        w_object(co->co_filename, p);
        w_object(co->co_name, p);
        /* One more integer, then the final nested object. */
        w_long(co->co_firstlineno, p);
        w_object(co->co_lnotab, p);
    }
    // ...
}
一个 CodeObject 就是按照这个顺序被一步步序列化的。至此,我们可以完善我们的 struct 。
// MTimePycFile sketches the byte layout that _code_to_timestamp_pyc produces
// for a timestamp-based pyc, listed in the order the pieces are appended.
type MTimePycFile struct {
	MagicNumber int32 // MAGIC_NUMBER: 2 little-endian bytes of the magic value followed by "\r\n"
	Sep uint32 // written as _pack_uint32(0); NOTE(review): CPython appears to use this word as the PEP 552 flags field — confirm
	ModificationTime uint32 // _pack_uint32(mtime): source modification time
	SourceSize uint32 // _pack_uint32(source_size): source length
	CodeData CodeObject // marshal.dumps(code)
}
// CodeObject lists the pieces of a marshalled code object in the exact
// order the PyCode_Check branch of w_complex_object writes them.
//
// The integer fields are emitted with w_long, which writes four bytes per
// value, so they are 32-bit here rather than 64-bit. Fields typed PyObject
// are nested values serialized recursively by w_object, each carrying its
// own type tag; their layout is not expanded in this sketch.
type CodeObject struct {
	TYPE_CODE          byte     // type tag written by W_TYPE(TYPE_CODE, p)
	co_argcount        int32    // w_long
	co_posonlyargcount int32    // w_long
	co_kwonlyargcount  int32    // w_long
	co_nlocals         int32    // w_long
	co_stacksize       int32    // w_long
	co_flags           int32    // w_long
	co_code            PyObject // w_object
	co_consts          PyObject // w_object
	co_names           PyObject // w_object
	co_varnames        PyObject // w_object
	co_freevars        PyObject // w_object
	co_cellvars        PyObject // w_object
	co_filename        PyObject // w_object
	co_name            PyObject // w_object
	co_firstlineno     int32    // w_long
	co_lnotab          PyObject // w_object
}
好了,这就是带时间戳的 pyc 文件结构了。PyObject 的结构由于篇幅有限,这里就不多阐述了,有兴趣的朋友可以继续深挖 marshal 。