0. 前言
最近突然想知道自己总共写了多少行代码,于是做了这样一个小工具……
1. 准备工作
先考虑一下希望得到的效果:
Language(语言) | Lines(代码行数) | Size(代码文件总大小) | Files(代码文件总数) |
A | 12345 | 300 KB | 193 |
B | 2345 | 165 KB | 98 |
如上,程序输出一个表格,将代码行数作为关键字排序。
代码框架:
# -*- encoding: utf-8 -*-
import ...
# Code line counter class (skeleton: scanning and reporting are still stubs)
class CodeLinesCounter(object):
    """Skeleton of the line counter; only the constructor does real work."""

    # (unit label, bytes per unit) pairs, smallest unit first.
    SIZES = [(unit, 1024 ** power) for power, unit in enumerate(('B', 'KB', 'MB', 'GB', 'TB'))]

    def __init__(self, languages):
        """Store the language table and zero every counter.

        languages: dict mapping a file suffix to a language name.
        """
        # Languages (dict, {file suffix: language})
        self._languages = languages
        # Statistics, {suffix: (lines, size, number of files)}
        self._codelines = dict.fromkeys(languages, (0, 0, 0))
        # Number of files processed successfully / with an error
        self._successful = 0
        self._error = 0

    def scan(self, directory, log=False):
        """Scan a directory (stub: only prints the optional log line).

        directory: the directory to scan.
        log: whether to print log output.
        """
        if log:
            print('Scanning', directory)

    def report(self):
        """Print the results (stub: does nothing yet)."""
# Create a CodeLinesCounter instance
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
# Scan drive E (note: a bare 'E:' cannot be used here)
counter.scan('E:/')
# Print the results
counter.report()
完成,下面正式进入主要部分
2. 统计
2.1 文件扫描
首先,我们需要获取根目录下的文件列表。这可以用 `os.walk` 实现:`os.walk(rootdir)` 返回一个生成器(可迭代),依次给出根目录及其每个子目录下的目录列表和文件列表。我们来看一个例子:
有一文件夹 `Folder`,结构如下:
Folder
|   file1
|   Folder1
|   |   file2
|   |   file3
|   Folder2
|   |   file4
|   |   Folder3
运行如下代码:
import os
# os.walk yields one (root, dirs, files) triple for every directory it visits.
for entry in os.walk('Folder'):
    root, dirs, files = entry
    print(root, dirs, files)
则输出如下:
Folder ['Folder1', 'Folder2'] ['file1']
Folder\Folder1 [] ['file2', 'file3']
Folder\Folder2 ['Folder3'] ['file4']
Folder\Folder2\Folder3 [] []
其中第一项是当前的根目录,第二项为目录下的目录列表,第三项则为当前的文件列表。
因此,我们可以编写如下代码:
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
class CodeLinesCounter(object):
    """Walk a directory tree and accumulate per-language file statistics.

    In this intermediate version every matching file contributes exactly one
    "line"; real line counting is not implemented yet.
    """

    # (unit label, bytes per unit) pairs, smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]

    def __init__(self, languages):
        """Create a counter.

        languages: dict mapping a file suffix (without the dot) to the
            language name shown in the report.
        """
        self._languages = languages
        # suffix -> (lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Success / failure counters (not updated by this version of scan).
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under *directory*.

        directory: root of the tree to walk.
        log: when true, print progress messages and every visited file path.

        A KeyboardInterrupt stops the walk early; totals gathered so far are
        kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition reports whether a dot exists at all, so a
                    # file literally named "py" is not mistaken for "*.py".
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if dot and suffix in self._results:
                        lines, size, num_files = self._results[suffix]
                        # No real counting yet: treat each file as one line.
                        # getsize returns the file size in bytes.
                        self._results[suffix] = (lines + 1, size + getsize(filename), num_files + 1)
                    if log:
                        print(filename)
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def report(self):
        """Print one tab-separated row per language, largest totals first."""
        print('Language\tLines\tSize\tFiles')
        # Sorting on the (lines, size, files) tuple orders rows by line count,
        # with size and file count as tie-breakers.
        rows = sorted(self._results.items(), key=lambda x: x[1], reverse=True)
        for suffix, (lines, size, files) in rows:
            print(self._languages[suffix], lines, self.__format_size(size), files, sep='\t')

    # Unit conversion
    def __format_size(self, bytes):
        """Return *bytes* as a two-decimal string with a unit, e.g. '1.50 KB'."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond the range of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)
# Build the counter with the suffix -> language table.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
# Scan the tree, then print the totals.
counter.scan('E:/')
counter.report()
运行结果应类似于下面这样(手动整理了一下):
Language Lines Size Files
C++ 667 671.51 KB 667
Python 317 981.01 KB 317
HTML 38 466.52 KB 38
Plain text 34 90.69 KB 34
JavaScript 19 1.43 MB 19
CSS 9 341.04 KB 9
C 2 20.45 KB 2
Java 1 676.00 B 1
好,下面来到行数统计部分(表格输出后面会介绍)。
2.2 行数统计
众所周知,空行不应该算在代码行数中。因此,统计时需忽略空行。先写上如下代码(替换掉刚才代码中暂按一行计算的 `lines += 1` 那一行):
# Open the file as UTF-8 text and count every line that is not blank.
with open(filename, mode='r', encoding='utf-8') as source:
    for text in source:
        if text.strip():  # empty after stripping whitespace -> blank line, skip it
            lines += 1
但是,正当我们兴致勃勃地运行时——
Traceback (most recent call last):
...
File "...\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 355: invalid start byte
程序报错 `UnicodeDecodeError`。分析后发现,原因是部分文件使用了 GBK 编码,按 UTF-8 解码时遇到非法字节,因此出错。
我们再次改进程序,使其尝试两种编码:
try:
ln = 0
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
if line and not line.isspace():
ln += 1
except UnicodeDecodeError: # 尝试使用GBK编码打开
try:
ln = 0
with open(filename, 'r', encoding='gbk') as f:
for line in f:
if line and not line.isspace():
ln += 1
except:
print(filename, '[Error: unknown encoding]')
self._error += 1
else:
lines += ln
except Exception as e:
print(filename, '[Unknown error: %s]' % e)
self._error += 1
continue
lines += ln
if log: print(f'{filename} [{ln}]')
self._successful += 1
这次,我们得到了正确的结果:
Language Lines Size Files
C++ 35595 671.51 KB 667
JavaScript 24485 1.43 MB 19
Python 24130 982.16 KB 317
CSS 8203 341.04 KB 9
HTML 6138 466.52 KB 38
Plain text 741 90.69 KB 34
C 557 20.45 KB 2
Java 29 676.00 B 1
现在仅剩最后一步了——制表。
3. 制表
Python 输出表格可以使用第三方库 PrettyTable。具体用法如下:
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable
class CodeLinesCounter(object):
    """Count non-blank lines, bytes and files per language under a directory."""

    # (unit label, bytes per unit) pairs, smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]
    # Text encodings tried, in order, when reading a source file.
    ENCODINGS = ('utf-8', 'gbk')

    def __init__(self, languages):
        """Create a counter.

        languages: dict mapping a file suffix (without the dot) to the
            language name shown in the report.
        """
        self._languages = languages
        # suffix -> (lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Number of files counted successfully / skipped because of an error.
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under *directory*.

        directory: root of the tree to walk.
        log: when true, print progress messages and one line per visited file.

        Files that cannot be sized, opened or decoded are reported, counted
        in the error total and left out of the per-language totals. A
        KeyboardInterrupt stops the walk early; totals gathered so far are
        kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition reports whether a dot exists at all, so a
                    # file literally named "py" is not mistaken for "*.py".
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log:
                            print(filename, '[None]')
                        continue
                    try:
                        file_size = getsize(filename)
                        ln = self._count_lines(filename)
                    except UnicodeDecodeError:
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        # `except Exception` (not bare) lets Ctrl-C reach the
                        # KeyboardInterrupt handler below.
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    # Totals are updated once, and only for files read in
                    # full, so nothing is double-counted or half-counted.
                    lines, size, num_files = self._results[suffix]
                    self._results[suffix] = (lines + ln, size + file_size, num_files + 1)
                    if log:
                        print(f'{filename} [{ln}]')
                    self._successful += 1
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def _count_lines(self, filename):
        """Return the number of non-blank lines in *filename*.

        Each encoding in ENCODINGS is tried in turn. Raises
        UnicodeDecodeError if none of them can decode the file; errors from
        opening or reading the file propagate unchanged.
        """
        last_error = None
        for encoding in self.ENCODINGS:
            try:
                with open(filename, 'r', encoding=encoding) as f:
                    return sum(1 for line in f if not line.isspace())
            except UnicodeDecodeError as e:
                last_error = e  # remember it and try the next encoding
        raise last_error

    def report(self):
        """Print the totals as a PrettyTable, largest line count first."""
        # Create the PrettyTable instance with a title showing OK/error counts.
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        # Sorting on the (lines, size, files) tuple orders rows by line count,
        # with size and file count as tie-breakers.
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files])
        print(table)

    def __format_size(self, bytes):
        """Return *bytes* as a two-decimal string with a unit, e.g. '1.50 KB'."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond the range of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)
# Build the counter with the suffix -> language table.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
# Scan the tree, then print the result table.
counter.scan('E:/')
counter.report()
运行结果:
+----------------------------------------+
| Scan result (OK 1087, Error 0) |
+------------+-------+-----------+-------+
| Language | Lines | Size | Files |
+------------+-------+-----------+-------+
| C++ | 35595 | 671.51 KB | 667 |
| JavaScript | 24485 | 1.43 MB | 19 |
| Python | 24130 | 982.16 KB | 317 |
| CSS | 8203 | 341.04 KB | 9 |
| HTML | 6138 | 466.52 KB | 38 |
| Plain text | 741 | 90.69 KB | 34 |
| C | 557 | 20.45 KB | 2 |
| Java | 29 | 676.00 B | 1 |
+------------+-------+-----------+-------+
4. 总结
最终代码(无注释):
# -*- encoding: utf-8 -*-
from os.path import join, getsize, abspath
from os import walk
from prettytable import PrettyTable
class CodeLinesCounter(object):
    """Count non-blank lines, bytes and files per language under a directory."""

    # (unit label, bytes per unit) pairs, smallest unit first.
    SIZES = [('B', 1), ('KB', 1024), ('MB', 1024**2), ('GB', 1024**3), ('TB', 1024**4)]
    # Text encodings tried, in order, when reading a source file.
    ENCODINGS = ('utf-8', 'gbk')

    def __init__(self, languages):
        """Create a counter.

        languages: dict mapping a file suffix (without the dot) to the
            language name shown in the report.
        """
        self._languages = languages
        # suffix -> (lines, total size in bytes, number of files)
        self._results = {suffix: (0, 0, 0) for suffix in languages}
        # Number of files counted successfully / skipped because of an error.
        self._successful = self._error = 0

    def scan(self, directory, log=False):
        """Accumulate statistics for every known-suffix file under *directory*.

        directory: root of the tree to walk.
        log: when true, print progress messages and one line per visited file.

        Files that cannot be sized, opened or decoded are reported, counted
        in the error total and left out of the per-language totals. A
        KeyboardInterrupt stops the walk early; totals gathered so far are
        kept.
        """
        if log:
            print('Scanning', directory)
        try:
            for root, _, files in walk(abspath(directory)):
                for filename in files:
                    # rpartition reports whether a dot exists at all, so a
                    # file literally named "py" is not mistaken for "*.py".
                    _, dot, suffix = filename.rpartition('.')
                    filename = join(root, filename)
                    if not dot or suffix not in self._results:
                        if log:
                            print(filename, '[None]')
                        continue
                    try:
                        file_size = getsize(filename)
                        ln = self._count_lines(filename)
                    except UnicodeDecodeError:
                        print(filename, '[Error: unknown encoding]')
                        self._error += 1
                        continue
                    except Exception as e:
                        # `except Exception` (not bare) lets Ctrl-C reach the
                        # KeyboardInterrupt handler below.
                        print(filename, '[Unknown error: %s]' % e)
                        self._error += 1
                        continue
                    # Totals are updated once, and only for files read in
                    # full, so nothing is double-counted or half-counted.
                    lines, size, num_files = self._results[suffix]
                    self._results[suffix] = (lines + ln, size + file_size, num_files + 1)
                    if log:
                        print(f'{filename} [{ln}]')
                    self._successful += 1
        except KeyboardInterrupt:
            print('\nUser stopped operation')
        else:
            if log:
                print('Scan finished')

    def _count_lines(self, filename):
        """Return the number of non-blank lines in *filename*.

        Each encoding in ENCODINGS is tried in turn. Raises
        UnicodeDecodeError if none of them can decode the file; errors from
        opening or reading the file propagate unchanged.
        """
        last_error = None
        for encoding in self.ENCODINGS:
            try:
                with open(filename, 'r', encoding=encoding) as f:
                    return sum(1 for line in f if not line.isspace())
            except UnicodeDecodeError as e:
                last_error = e  # remember it and try the next encoding
        raise last_error

    def report(self):
        """Print the totals as a PrettyTable, largest line count first."""
        table = PrettyTable(['Language', 'Lines', 'Size', 'Files'], title=f'Scan result (OK {self._successful}, Error {self._error})')
        # Sorting on the (lines, size, files) tuple orders rows by line count,
        # with size and file count as tie-breakers.
        for suffix, (lines, size, files) in sorted(self._results.items(), key=lambda x: x[1], reverse=True):
            table.add_row([self._languages[suffix], lines, self.__format_size(size), files])
        print(table)

    def __format_size(self, bytes):
        """Return *bytes* as a two-decimal string with a unit, e.g. '1.50 KB'."""
        for suffix, size in self.SIZES:
            if bytes < size * 1024:
                return '%.2f %s' % (bytes / size, suffix)
        # Beyond the range of the largest unit: still express it in that unit.
        suffix, size = self.SIZES[-1]
        return '%.2f %s' % (bytes / size, suffix)
# Build the counter with the suffix -> language table.
counter = CodeLinesCounter(
    languages={
        'py': 'Python',
        'c': 'C',
        'cpp': 'C++',
        'java': 'Java',
        'js': 'JavaScript',
        'html': 'HTML',
        'css': 'CSS',
        'txt': 'Plain text',
    },
)
# Scan the tree, then print the result table.
counter.scan('E:/')
counter.report()
后期改进:
- 增加正则表达式忽略文件
- matplotlib 绘图
- PyQt5 GUI
- ……(欢迎提出宝贵的意见!)