一、情景描述
1、磁盘文件太多,又舍不得删,单纯想清理出重复文件,人工太费力
2、成品下载:
百度云:https://pan.baidu.com/s/1W3pHU-dGi_mrd8M140Vogg
提取码:ji0r
3、成品使用:
(1) 将 repeat.exe 复制到需要清理的文件夹下;
(2) 双击运行 repeat.exe;
(3) 程序自动遍历当前文件夹及其子文件夹下的所有文件,逐一读取文件的“指纹”信息并记录到 /RESULT/md5.his,同一“指纹”的多个文件的路径信息将以行的形式写入 /RESULT/record.log ;
(4) 保留最原始文件的同时将其他“副本”剪切到 /RESULT/REPEATS/ 文件夹,以备用户自主选择删除与否;
(5) 文件的深度遍历优先查找较大体积的文件,所以在遍历前期可能较为“卡顿”;
(6) 当遇到文件较多,遍历耗时太久的情况,可以直接关闭该程序,在下次遍历时已遍历文件将不再重复遍历,提高遍历的效率,减少无用功。注意:不可删除文件 /RESULT/md5.his 。
运行前:
观察发现 /根/son.jpg 和 /根/子/son.jpg 互为重复
/根/grand.jpg 和 /根/子/孙/grand.jpg 互为重复
/根/root.jpg 无重复
(为方便说明和区分,文件的所有副本均不改名地存储在其他文件夹下)
运行时:
运行后:
结果显示 /根/son.jpg 的副本 /根/子/son.jpg 和
/根/grand.jpg 的副本 /根/子/孙/grand.jpg 均被移动到了 /根/RESULT/REPEATS/ 文件夹下
用户可以自行选择是否删除这些文件副本
二、需求分析
同一文件可能重复存在,文件名可能相同,也可能不同,自然希望保留原始文件(创建时间最早,文件名非“副本”、非“xx (2)”)
怎么判断两个文件为同一文件?如何像识别人体指纹一样准确区分?
文件太多,单个文件太大,一次遍历耗时太久,如何在二次遍历时重复利用上次遍历的结果?
如何避免某几个文件的权限不足对整体遍历的影响?
程序运行后的缓存垃圾如何清理?
三、代码实现
1、folder.py
# !/usr/bin/python3
# coding: utf-8
import os
import tool
def deep_list(path):
    """Return the paths of all regular files found under *path*, recursively.

    A *path* that is not a directory yields an empty list, and so does a
    directory whose listing raises ``PermissionError`` (a notice is printed
    in that case). Entries are visited in ``os.listdir`` order; the files of
    a sub-directory are spliced in at the position of that sub-directory.
    """
    if not os.path.isdir(path):
        return []
    try:
        entries = os.listdir(path)
    except PermissionError:
        print("PermissionError:", path)
        return []
    collected = []
    for entry in entries:
        full_path = tool.join(path, entry)
        if os.path.isfile(full_path):
            collected.append(full_path)
            continue
        if os.path.isdir(full_path):
            # Recurse and keep the sub-tree's files in place.
            collected += deep_list(full_path)
    return collected
2、file.py
# !/usr/bin/python3
# coding: utf-8
import hashlib
import os
import traceback
def md5(path):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The digest is also printed as ``"<path> : <digest>"``.

    Returns:
        The 32-character hex digest, or ``None`` when *path* is not a regular
        file or when reading/reporting it fails (the traceback is printed).
    """
    if not os.path.isfile(path):
        return None
    try:
        hashes = hashlib.md5()
        # The context manager closes the handle even when a read fails.
        with open(path, "rb") as f:
            while True:
                # 1 MiB chunks keep memory bounded while avoiding the
                # overhead of thousands of tiny reads on large files.
                chunk = f.read(1024 * 1024)
                if not chunk:
                    break
                hashes.update(chunk)
        md = hashes.hexdigest()
        print("%s : %s" % (path, md))
        return md
    except Exception:
        # Best effort: one unreadable file must not abort the caller.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        traceback.print_exc()
        return None
def name_order(path):
    """Build a string sort key for ranking copies of the same file.

    The key is ``"<suffix>_<earliest time in ns>_<stem>_<depth>"``, built
    from the lower-cased, stripped path:

    * suffix - the file extension (including the dot);
    * earliest time - the smallest of ``st_ctime_ns``, ``st_atime_ns`` and
      ``st_mtime_ns``;
    * stem - the file name without its extension;
    * depth - the number of backslash-separated components of the path.

    Returns:
        The key string, or ``None`` when *path* does not exist.
    """
    if not os.path.exists(path):
        return None
    # Stat the path exactly as given: lower-casing it first makes the lookup
    # fail on case-sensitive file systems.
    info = os.stat(path)
    create = min(info.st_ctime_ns, info.st_atime_ns, info.st_mtime_ns)
    path = str(path).lower().strip()
    # splitext removes only the trailing extension; a plain str.replace would
    # also strip the same text from the middle of the name ("a.jpg.jpg").
    stem, suf = os.path.splitext(os.path.basename(path))
    name = stem.strip()
    # NOTE(review): the original replace calls mapped each parenthesis to
    # itself (a no-op). Presumably full-width parentheses were meant to be
    # normalized to ASCII so "x(2)" and "x(2)" rank alike - confirm.
    name = name.replace("\uff08", "(").replace("\uff09", ")")
    layer = len(path.split("\\"))
    return "_".join((suf, str(create), name, str(layer)))
3、tool.py
# !/usr/bin/python3
# coding: utf-8
def join(path, *paths):
    """Join *path* and every element of *paths* with a backslash.

    Each piece, and the running result after every append, is passed through
    ``fmt``, so the returned string is in the form ``fmt`` produces.
    """
    joined = fmt(path)
    for piece in paths:
        joined = fmt(joined + "\\" + fmt(piece))
    return joined
def fmt(path):
    """Normalize *path* to a backslash-separated string.

    ``None`` becomes ``""``. Otherwise surrounding whitespace is stripped,
    every forward slash becomes a backslash, and each run of consecutive
    backslashes is collapsed to a single one.
    """
    if path is None:
        return ""
    normalized = path.strip().replace("/", "\\")
    # A single replace pass halves a run of backslashes; repeat until no
    # doubled backslash is left.
    while "\\\\" in normalized:
        normalized = normalized.replace("\\\\", "\\")
    return normalized
4、mei.py
# !/usr/bin/python3
# coding: utf-8
import os
import re
import sys
import tool
def IS_MEI(basename):
    """Return True when *basename* is ``_MEI`` followed by one or more digits.

    The pattern is a raw string: ``\\d`` inside a normal string literal is an
    invalid escape sequence that newer Python versions warn about.
    """
    return re.match(r"^_MEI\d+$", basename) is not None
def remove():
    """Delete the first existing ``_MEI<digits>`` directory listed on sys.path.

    Entries of ``sys.path`` whose base name does not satisfy ``IS_MEI`` are
    skipped. An entry without a drive is resolved against the current working
    directory. The first such entry that is a directory is removed and the
    search stops.
    """
    # Local import: only this cleanup step needs shutil.
    import shutil

    for path in sys.path:
        if not IS_MEI(os.path.basename(path)):
            continue
        if "" == os.path.splitdrive(path)[0]:
            path = tool.join(os.getcwd(), path)
        if os.path.isdir(path):
            print("remove", path)
            # os.remove() cannot delete a directory, so the original call
            # always failed, and its `finally: break` hid the error.
            # rmtree removes the tree; ignore_errors keeps the cleanup best
            # effort when some files cannot be deleted.
            shutil.rmtree(path, ignore_errors=True)
            break
5、repeat.py(核心)
# !/usr/bin/python3
# coding: utf-8
import gc
import os
import sys
import time
import traceback
# Put the parent of this script's folder on the import search path, then
# print every search-path entry with a 1-based index.
this = os.path.abspath(os.path.dirname(__file__))
module = os.path.dirname(this)
sys.path.append(module)
for i, val in enumerate(sys.path):
    print("[{}] {}".format(i + 1, val))
import file
import folder
import tool
import mei
# Separator between a file path and its MD5 on each line of the history files.
SEGMENTER = ">>"
# Name of the output folder created under the scanned folder.
FOLDER_FOR_RESULT = "RESULT"
# Sub-folder of FOLDER_FOR_RESULT that receives the moved duplicate files.
FOLDER_FOR_REPEATS = "REPEATS"
# Path/MD5 history file read back at the start of a scan.
FILE_MD5_HIS = "md5.his"
# Working copy of the history written during a scan; it is renamed to
# FILE_MD5_HIS by save_file_md5.
FILE_MD5_TMP = "md5.tmp"
# Log file to which each run appends the groups of duplicate paths it found.
FILE_RECORD_LOG = "record.log"
def read_md5_his(cwd):
    """Load the path -> MD5 history recorded under *cwd*.

    Reads ``<cwd>\\RESULT\\md5.his``, one ``<path>>><md5>`` entry per line.
    Entries whose path no longer exists, or whose MD5 field is shorter than
    two characters, are dropped.

    Returns:
        A dict mapping file path to MD5 string; empty when there is no
        history file.
    """
    his_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_HIS)
    if not os.path.exists(his_path):
        return dict()
    fpMd5 = dict()
    with open(his_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the LAST separator so the MD5 is always the final
            # field. Lines without a separator (blank or truncated, e.g.
            # after the program was killed mid-write) are skipped instead of
            # raising IndexError.
            fp, sep, md5 = line.rpartition(SEGMENTER)
            if not sep:
                continue
            fp = fp.strip()
            md5 = md5.strip()
            if len(md5) > 1 and os.path.exists(fp):
                fpMd5[fp] = md5
    return fpMd5
def reverse_file_md5(fileMd5):
    """Invert a path -> MD5 mapping into an MD5 -> set-of-paths mapping.

    Keys are inserted in ascending MD5 order, so iterating the result visits
    the digests sorted.
    """
    grouped = dict()
    for path, digest in sorted(fileMd5.items(), key=lambda pair: pair[1]):
        grouped.setdefault(digest, set()).add(path)
    return grouped
def _file_size(fp):
    """Return the size of *fp* in bytes, or 0 when it cannot be stat'ed."""
    try:
        return os.stat(fp).st_size
    except OSError:
        return 0


def _append_md5_rows(tmp_path, rows):
    """Append one ``<path>>><md5>`` line per (path, md5) pair to *tmp_path*."""
    with open(tmp_path, 'a', encoding='utf-8') as f:
        f.write("".join(p + SEGMENTER + md + "\n" for p, md in rows))


def _free_destination(repeats_path, filename):
    """Return a path for *filename* inside *repeats_path* that does not exist yet."""
    candidate = tool.join(repeats_path, filename)
    stem, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = tool.join(repeats_path, "%s (%d)%s" % (stem, counter, ext))
        counter += 1
    return candidate


def scan_folder(cwd):
    """Find duplicate files under *cwd* and move the extra copies aside.

    Steps:
      0. list every file under *cwd*;
      1. drop the running executable and everything inside the RESULT folder;
      2. load the MD5 history and, when it is not empty, rewrite md5.tmp
         from it;
      3. make sure RESULT\\REPEATS exists and order the files largest first;
      4. hash every file that is not in the history, appending the results
         to md5.tmp in batches of ten;
      5. for every MD5 shared by two or more files, keep the file whose
         ``file.name_order`` key sorts first and move the others into
         RESULT\\REPEATS;
      6. append the groups of duplicate paths to record.log.

    Nothing is returned. Failures to move an individual file are printed and
    do not stop the scan.
    """
    # step0: listdir
    fps = folder.deep_list(cwd)
    if len(fps) < 2:  # must have it self
        print("No FILE IN", cwd)
        return

    # step1: remove self and root_folder
    exe_path = os.path.abspath(sys.executable)
    print("SELF IS", exe_path)
    result_path = tool.join(cwd, FOLDER_FOR_RESULT)
    # Compare against "RESULT\" so that siblings such as "RESULT2\..." or
    # "RESULT.txt" are still scanned; a bare prefix test excluded them.
    result_prefix = result_path + "\\"
    fps = [
        fp for fp in fps
        if fp != exe_path
        and fp != result_path
        and not str(fp).startswith(result_prefix)
    ]
    if len(fps) < 1:
        print("No FILE IN", cwd)
        return

    # step2: read history and write to tmp
    tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)
    fpMd5 = read_md5_his(cwd)
    if len(fpMd5) > 0:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write("".join(p + SEGMENTER + md + "\n" for p, md in fpMd5.items()))
    print()

    # step3: makedir
    repeats_path = tool.join(cwd, FOLDER_FOR_RESULT, FOLDER_FOR_REPEATS)
    os.makedirs(repeats_path, exist_ok=True)
    # Largest files first; a file that cannot be stat'ed sorts last instead
    # of aborting the scan.
    fps.sort(key=_file_size, reverse=True)

    # step4: read file's md5 and append to tmp
    pending = []
    for fp in fps:
        if fp in fpMd5:
            continue
        md5 = file.md5(fp)
        if md5 is None:
            continue
        fpMd5[fp] = md5
        pending.append((fp, md5))
        if len(pending) >= 10:
            _append_md5_rows(tmp_path, pending)
            pending.clear()
    # Flush the last partial batch after the loop: tying the flush to the
    # final index lost it whenever the final file was already in the history.
    if pending:
        _append_md5_rows(tmp_path, pending)
    print()

    # step5: remove the repeat
    record_lines = []
    for md5, group in reverse_file_md5(fpMd5).items():
        if len(group) < 2:
            continue
        # name_order returns None for a file that has vanished; drop those so
        # the sort never compares None with a string.
        keyed = [(file.name_order(fp), fp) for fp in group]
        keyed = [pair for pair in keyed if pair[0] is not None]
        if len(keyed) < 2:
            continue
        keyed.sort(key=lambda pair: pair[0])
        ordered = [fp for _, fp in keyed]
        print("%s : %s" % (md5, ','.join(ordered)))
        record_lines.append(','.join(ordered) + "\n")
        # not move the first
        for old in ordered[1:]:
            fn = os.path.basename(old)
            if old == tool.join(repeats_path, fn):
                continue
            # Never delete a file that already sits in REPEATS under the same
            # name (it may belong to a different group, i.e. have different
            # content); pick an unused name instead.
            new = _free_destination(repeats_path, fn)
            # move the repeats to RESULT/REPEATS folder
            try:
                os.rename(old, new)
            except OSError:
                traceback.print_exc()

    # step6: record the repeat
    content = "".join(record_lines)
    if "" == content:
        print("No REPEAT FILE EXISTS")
    else:
        this_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        record = tool.join(cwd, FOLDER_FOR_RESULT, FILE_RECORD_LOG)
        with open(record, 'a', encoding='utf-8') as f:
            f.write("\n\n" + this_time + "\n" + content)
def save_file_md5(cwd):
    """Promote ``md5.tmp`` to ``md5.his`` inside ``<cwd>\\RESULT``.

    Does nothing when there is no ``md5.tmp``. File-system errors are printed
    and not raised.
    """
    # step7: update md5 history record file
    his_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_HIS)
    tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)
    try:
        if os.path.exists(tmp_path):
            # os.replace overwrites the destination in one step; deleting the
            # old history first could leave no history at all if the rename
            # then failed or the program was closed in between.
            os.replace(tmp_path, his_path)
    except OSError:
        traceback.print_exc()
if __name__ == '__main__':
    # Script entry: scan the current working directory, persist the MD5
    # history, wait for the user, then clean up via mei.remove().
    cwd = None  # stays None if os.getcwd() itself fails
    try:
        cwd = os.getcwd()
        print("\nCURRENT PATH IS %s\n" % cwd)
        scan_folder(cwd)
    except (Exception, KeyboardInterrupt):
        # Report the problem but keep going, so the history is saved and the
        # console stays open; Ctrl+C is handled the same way.
        traceback.print_exc()
    finally:
        # Guarded: with cwd unbound the original raised NameError here.
        if cwd is not None:
            save_file_md5(cwd)
    gc.collect()
    input("\nPRESS ANY KEYS TO EXIT\n")
    mei.remove()
四、打包 exe
pyinstaller -F repeat.py
本处不作详解,详情参考:
《pyinstaller打包经验总结》
《pyinstaller打包exe爆满temp》