Python数据统计作业 python数据统计代码

转载

footballboy 2023-10-03 10:18:41

文章标签 Python数据统计作业 python 小工具 CSS Java 文章分类 Python 后端开发

0. 前言

最近突然想知道自己总共写了多少行代码，于是做了这样一个小工具……

1. 准备工作

先考虑一下希望得到的效果：

Language（语言）	Lines（代码行数）	Size（代码文件总大小）	Files（代码文件总数）
A	12345	300 KB	193
B	2345	165 KB	98

如上，程序输出一个表格，将代码行数作为关键字排序。
代码框架：

# -*- encoding: utf-8 -*-
import ...

# 代码行数计数类
class CodeLinesCounter(object):
    """Skeleton of the per-language code line counter.

    ``scan`` and ``report`` are still placeholders in this version.
    """

    # (unit name, size of one unit in bytes), smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Create a counter.

        :param languages: dict mapping a file suffix to its language name.
        """
        self._languages = languages
        # Running totals: {suffix: (lines, size, number of files)}.
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Number of files handled successfully / with an error.
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Scan a directory (not implemented yet; only the log line is printed).

        :param directory: the directory to scan.
        :param log: whether to print log messages.
        """
        if log: print('Scanning', directory)

    def report(self):
        """Print the results (not implemented yet)."""

# Create the CodeLinesCounter instance; the dict maps file suffix -> language name.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
# Scan drive E (note: the bare 'E:' must not be used here).
counter.scan('E:/')
# Print the result.
counter.report()

完成，下面正式进入主要部分

2. 统计

2.1 文件扫描

首先，我们需要获取根目录下的文件列表。这可以用os.walk实现：
os.walk(rootdir)返回一个生成器（可迭代对象），它会依次遍历根目录及其所有子目录，每次产出一个三元组：(当前目录路径, 子目录列表, 文件列表)。我们来看一个例子：
有一文件夹Folder如下：

Folder
|   file1
|   Folder1
|       file2
|       file3
|   Folder2
|       file4
|       Folder3

运行如下代码：

import os

# Each step of the walk yields a (directory path, sub-directory names, file names) triple.
for current_dir, subdirs, filenames in os.walk('Folder'):
    print(current_dir, subdirs, filenames)

则输出如下：

Folder					['Folder1', 'Folder2']	['file1']
Folder\Folder1			[]						['file2', 'file3']
Folder\Folder2			['Folder3']				['file4']
Folder\Folder2\Folder3	[]						[]

其中第一项是当前的根目录，第二项为目录下的目录列表，第三项则为当前的文件列表。
因此，我们可以编写如下代码：

# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk

class CodeLinesCounter(object):
    """Collect per-language file statistics for a directory tree.

    In this version every matching file is counted as a single line.
    """

    # (unit name, size of one unit in bytes), smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Create a counter.

        :param languages: dict mapping a file suffix (the text after the last dot)
            to the language name shown in the report.
        """
        self._languages = languages
        # Running totals: {suffix: (lines, size in bytes, number of files)}.
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Walk ``directory`` recursively and add every recognised file to the totals.

        Ctrl-C stops the scan early; totals collected so far are kept.

        :param directory: root directory to scan.
        :param log: when true, print the path of every file visited.
        """
        if log: print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition leaves ``dot`` empty when the name has no '.', so a file
                    # literally named 'py' or 'c' is not mistaken for source code.
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if dot and suffix in self._results:
                        lines, size, numFiles = self._results[suffix]
                        lines += 1 # lines are not counted yet; treat each file as one line
                        numFiles += 1
                        size += getsize(filename) # getsize returns the file size in bytes
                        self._results[suffix] = (lines, size, numFiles)
                    if log: print(filename)
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log: print('Scan finished')

    def report(self):
        """Print one tab-separated row per language, largest (lines, size, files) first."""
        print('Language\tLines\tSize\tFiles')
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            print(self._languages[suffix], lines, self.__format_size(size), files, sep='\t')

    def __format_size(self, bytes):
        """Format a byte count as ``'<value> <unit>'`` with two decimals.

        The smallest unit that keeps the value below 1024 is used; a count too
        large for that is expressed in the largest unit of ``SIZES``.
        """
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # No unit keeps the value below 1024: fall back to the largest unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)

# Build the counter from a suffix -> language name table, scan drive E, print the totals.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
counter.scan('E:/')
counter.report()

运行结果应类似于下面这样（手动整理了一下）：

Language        Lines   Size    		Files
C++     		667     671.51 KB       667
Python  		317     981.01 KB       317
HTML    		38      466.52 KB       38
Plain text      34      90.69 KB        34
JavaScript      19      1.43 MB			19
CSS     		9       341.04 KB       9
C       		2       20.45 KB        2
Java    		1       676.00 B        1

好，下面来到行数统计部分（表格输出后面会介绍）。

2.2 行数统计

众所周知，空行不应该算在代码行数中。因此，统计时需忽略空行。先写上如下代码（替换掉刚才代码中的 `lines += 1` 那一行）：

# Read the file as UTF-8 and count every line that is not blank.
with open(filename, encoding='utf-8') as source:
    for text in source:
        if text.strip(): # whitespace-only lines strip down to '' and are skipped
            lines += 1

但是，正当我们兴致勃勃地运行时——

Traceback (most recent call last):
  ...
  File "...\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 355: invalid start byte

程序报错UnicodeDecodeError，分析后发现原因是部分文件使用了GBK编码，而utf-8编码无法正确打开，因此造成错误。
我们再次改进程序，使其尝试两种编码：

try:
    ln = 0
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if line and not line.isspace():
                ln += 1
except UnicodeDecodeError: # 尝试使用GBK编码打开
    try:
        ln = 0
        with open(filename, 'r', encoding='gbk') as f:
            for line in f:
                if line and not line.isspace():
                    ln += 1
    except:
        print(filename, '[Error: unknown encoding]')
        self._error += 1
    else:
        lines += ln
except Exception as e:
    print(filename, '[Unknown error: %s]' % e)
    self._error += 1
    continue
lines += ln
if log: print(f'{filename} [{ln}]')
self._successful += 1

这次，我们得到了正确的结果：

Language        Lines   Size    		Files
C++     		35595   671.51 KB       667
JavaScript      24485   1.43 MB 		19
Python  		24130   982.16 KB       317
CSS     		8203    341.04 KB       9
HTML    		6138    466.52 KB       38
Plain text      741     90.69 KB        34
C       		557     20.45 KB        2
Java    		29      676.00 B        1

现在仅剩最后一步了——制表。

3. 制表

python输出表格可以使用PrettyTable库。具体用法如下：

# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable

class CodeLinesCounter(object):
    """Count non-blank lines, total size and number of files per language in a directory tree."""

    # (unit name, size of one unit in bytes), smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Create a counter.

        :param languages: dict mapping a file suffix (the text after the last dot)
            to the language name shown in the report.
        """
        self._languages = languages
        # Running totals: {suffix: (non-blank lines, size in bytes, number of files)}.
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Number of files counted successfully / skipped because of an error.
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Walk ``directory`` recursively and add every recognised file to the totals.

        A file that cannot be read or decoded is reported on stdout, counted in the
        error total and left out of the per-language results. Ctrl-C stops the scan
        early; totals collected so far are kept.

        :param directory: root directory to scan.
        :param log: when true, print a line for every file visited.
        """
        if log: print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition leaves ``dot`` empty when the name has no '.', so a file
                    # literally named 'py' or 'c' is not mistaken for source code.
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log: print(filename, '[None]')
                        continue
                    try:
                        ln = self._count_lines(filename)
                        fileSize = getsize(filename)
                    except UnicodeDecodeError: # neither UTF-8 nor GBK
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    # Commit the totals only after the whole file was processed, so each
                    # file is counted exactly once and never as both success and error.
                    lines, size, numFiles = self._results[suffix]
                    self._results[suffix] = (lines + ln, size + fileSize, numFiles + 1)
                    if log: print(f'{filename} [{ln}]')
                    self._successful += 1
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log: print('Scan finished')

    @staticmethod
    def _count_lines(filename):
        """Return the number of non-blank lines in ``filename``.

        The file is read as UTF-8 first and as GBK if that fails.

        :raises UnicodeDecodeError: if the file is valid in neither encoding.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                return sum(1 for line in f if not line.isspace())
        except UnicodeDecodeError:
            with open(filename, 'r', encoding='gbk') as f:
                return sum(1 for line in f if not line.isspace())

    def report(self):
        """Print the totals as a PrettyTable, largest (lines, size, files) first."""
        # Create the table with its column headers and a title showing the OK/error counts.
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files]) # one row per language
        print(table)

    def __format_size(self, bytes):
        """Format a byte count as ``'<value> <unit>'`` with two decimals.

        The smallest unit that keeps the value below 1024 is used; a count too
        large for that is expressed in the largest unit of ``SIZES``.
        """
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # No unit keeps the value below 1024: fall back to the largest unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)

# Build the counter from a suffix -> language name table, scan drive E, print the table.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
counter.scan('E:/')
counter.report()

运行结果：

+----------------------------------------+
|     Scan result (OK 1087, Error 0)     |
+------------+-------+-----------+-------+
|  Language  | Lines |    Size   | Files |
+------------+-------+-----------+-------+
|    C++     | 35595 | 671.51 KB |  667  |
| JavaScript | 24485 |  1.43 MB  |   19  |
|   Python   | 24130 | 982.16 KB |  317  |
|    CSS     |  8203 | 341.04 KB |   9   |
|    HTML    |  6138 | 466.52 KB |   38  |
| Plain text |  741  |  90.69 KB |   34  |
|     C      |  557  |  20.45 KB |   2   |
|    Java    |   29  |  676.00 B |   1   |
+------------+-------+-----------+-------+

4. 总结

最终完整代码：

# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable

class CodeLinesCounter(object):
    """Count non-blank lines, total size and number of files per language in a directory tree."""

    # (unit name, size of one unit in bytes), smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Create a counter.

        :param languages: dict mapping a file suffix (the text after the last dot)
            to the language name shown in the report.
        """
        self._languages = languages
        # Running totals: {suffix: (non-blank lines, size in bytes, number of files)}.
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Number of files counted successfully / skipped because of an error.
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Walk ``directory`` recursively and add every recognised file to the totals.

        A file that cannot be read or decoded is reported on stdout, counted in the
        error total and left out of the per-language results. Ctrl-C stops the scan
        early; totals collected so far are kept.

        :param directory: root directory to scan.
        :param log: when true, print a line for every file visited.
        """
        if log: print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition leaves ``dot`` empty when the name has no '.', so a file
                    # literally named 'py' or 'c' is not mistaken for source code.
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log: print(filename, '[None]')
                        continue
                    try:
                        ln = self._count_lines(filename)
                        fileSize = getsize(filename)
                    except UnicodeDecodeError: # neither UTF-8 nor GBK
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    # Commit the totals only after the whole file was processed, so each
                    # file is counted exactly once and never as both success and error.
                    lines, size, numFiles = self._results[suffix]
                    self._results[suffix] = (lines + ln, size + fileSize, numFiles + 1)
                    if log: print(f'{filename} [{ln}]')
                    self._successful += 1
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log: print('Scan finished')

    @staticmethod
    def _count_lines(filename):
        """Return the number of non-blank lines in ``filename``.

        The file is read as UTF-8 first and as GBK if that fails.

        :raises UnicodeDecodeError: if the file is valid in neither encoding.
        """
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                return sum(1 for line in f if not line.isspace())
        except UnicodeDecodeError:
            with open(filename, 'r', encoding='gbk') as f:
                return sum(1 for line in f if not line.isspace())

    def report(self):
        """Print the totals as a PrettyTable, largest (lines, size, files) first."""
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files])
        print(table)

    def __format_size(self, bytes):
        """Format a byte count as ``'<value> <unit>'`` with two decimals.

        The smallest unit that keeps the value below 1024 is used; a count too
        large for that is expressed in the largest unit of ``SIZES``.
        """
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # No unit keeps the value below 1024: fall back to the largest unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)

# Build the counter from a suffix -> language name table, scan drive E, print the table.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
counter.scan('E:/')
counter.report()

后期改进：