使用python对文件进行批量处理


代码有点长,已封装为类,主要用于对文件进行批量处理


1、批量移动文件,符合某种后缀的


2、批量查找两个文件夹重复的文件


3、批量同步两个文件夹的文件


2和3,我现在用duplicate这个软件,同步的话使用File Synchronizer这个软件,代码的话不怎么用了


4、批量移动和复制文件的时候会遇到一个问题,就是若存在相同文件名的情况,这个时候可以在文件名后加“_1”、“_2”这种方式来解决(与下面代码的实现一致),比之前用随机时间的方式要好,那种方式产生的文件名太长了


5、批量提取docx文档中的图片(如何批量提取doc中的呢?去excelhome论坛搜索vba转doc为docx的代码,批量转换即可)


 


Sub doc2docx()    ' Convert the selected .doc files to .docx files
    ' Shows a multi-select file picker, opens every chosen document and saves
    ' a copy next to it with the extension changed from .doc to .docx.

    Dim myDialog As FileDialog
    Dim oFile As Object
    Dim oFilePath As Variant
    Dim newPath As String

    Set myDialog = Application.FileDialog(msoFileDialogFilePicker)

    With myDialog
        .Filters.Clear                                  ' drop every existing file filter
        .Filters.Add "所有 WORD2007 文件", "*.doc", 1    ' only offer *.doc files
        .AllowMultiSelect = True                        ' allow picking several files
        If .Show = -1 Then                              ' -1: the user confirmed the dialog
            For Each oFilePath In .SelectedItems
                ' Only rewrite the trailing extension. A plain
                ' Replace(path, "doc", "docx") would also change folder names
                ' that contain "doc" and would turn ".docx" into ".docxx".
                If LCase(Right(oFilePath, 4)) = ".doc" Then
                    newPath = Left(oFilePath, Len(oFilePath) - 4) & ".docx"
                    Set oFile = Documents.Open(oFilePath)
                    ' 16 = wdFormatDocumentDefault (the default .docx format)
                    oFile.SaveAs FileName:=newPath, FileFormat:=16
                    oFile.Close
                End If
            Next
        End If
    End With

End Sub


  


使用python对文件进行批量处理_文件名使用python对文件进行批量处理_批量修改_02


import os
import shutil
import zipfile
from genericpath import exists
from hashlib import md5
from os import path
from pathlib import Path

from PIL import Image
from send2trash import send2trash
10
# revised by Stephen Shen @zju
# 2021-4-8 10:14:22
# https://rednafi.github.io/digressions/python/2020/04/13/python-pathlib.html#pathrenametarget
# https://pypi.org/project/Send2Trash/
# revised by Stephen Shen @zju
# 2021年3月15日09:24:10
# zipfile 模块使用说明
# https://www.cnblogs.com/ManyQian/p/9193199.html
#
# shutil.copyfile("oldfile","newfile") #oldfile和newfile都只能是文件
# shutil.copy("oldfile","newfile") #oldfile只能是文件,newfile可以是文件,也可以是目标目录
# #复制文件夹:
# shutil.copytree("olddir","newdir") #olddir和newdir都只能是目录,且newdir必须不存在
# #重命名文件(目录)
# os.rename("oldname","newname") #文件或目录都是使用这条命令
# #移动文件(目录)
# shutil.move("oldpos","newpos")
28
29
class PathBox:
    """Static helpers for batch file operations.

    Covers moving/copying with collision-free names, syncing two folders,
    removing duplicate files, resizing pictures and extracting the pictures
    embedded in .docx documents. Every method is a ``staticmethod``; the class
    is only used as a namespace.
    """

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _uniquePath(path):
        """Return ``path`` itself, or ``stem_1.ext``, ``stem_2.ext``, ... if taken.

        The index is always appended to the ORIGINAL stem, so the result is
        never a compounded name such as ``stem_1_2.ext``.
        """
        candidate = path
        index = 1
        while candidate.exists():
            candidate = path.with_name(
                '{}_{}{}'.format(path.stem, index, path.suffix))
            index += 1
        return candidate

    # ------------------------------------------------------------------
    # docx picture extraction
    # ------------------------------------------------------------------
    @staticmethod
    def batchExtractPicsFromDocs(srcDir, dstDir, zipDir):
        """Copy every ``.docx`` below ``srcDir`` into ``zipDir`` as ``.zip`` and unpack it.

        :param srcDir: folder that is searched recursively for ``.docx`` files
        :param dstDir: created if missing and handed to ``extractZipFile``
        :param zipDir: folder receiving the ``<stem>.zip`` copies; the archives
            are unpacked below this folder as well
        """
        src_dir = Path(srcDir)
        dst_dir = Path(dstDir)
        zip_dir = Path(zipDir)

        # parents=True so that nested, not yet existing folders work as well.
        for folder in (dst_dir, src_dir, zip_dir):
            folder.mkdir(parents=True, exist_ok=True)

        for root, _dirs, files in os.walk(src_dir):
            for name in files:
                src_path = Path(root) / name
                if src_path.suffix != '.docx':
                    print('{} is not docx'.format(str(src_path)))
                    continue

                # A .docx file is a zip container; a copy named *.zip is enough.
                zip_path = zip_dir / (src_path.stem + '.zip')
                if zip_path.exists():
                    print('{} is existed'.format(str(zip_path)))
                else:
                    PathBox.copyAsZip(src_path, zip_path)
                    print('{} is copied as zip file'.format(zip_path))

        PathBox.extractZipFile(zip_dir, dst_dir)

    @staticmethod
    def copyAsZip(srcpath, dstpath):
        """Copy the file ``srcpath`` to ``dstpath`` (content only)."""
        shutil.copyfile(str(srcpath), str(dstpath))

    @staticmethod
    def extractPics(dstDir, zippath, zipDir):
        """Unpack the archive ``zippath`` into ``zipDir``.

        :param dstDir: not used by this method
        :param zippath: archive to unpack
        :param zipDir: folder the archive is unpacked into
        :return: ``Path(zipDir) / 'word/media'`` on success, ``None`` when the
            archive could not be unpacked
        """
        try:
            with zipfile.ZipFile(str(zippath), 'r') as archive:
                archive.extractall(str(zipDir))
        except (zipfile.BadZipFile, OSError, RuntimeError, NotImplementedError):
            print('{zippath} cannot be extracted'.format(zippath=zippath))
            return None

        print('{zippath} is extracted'.format(zippath=zippath))
        return Path(zipDir).joinpath('word/media')

    @staticmethod
    def extractZipFile(srcDir, dstDir):
        """Unpack every ``.zip``/``.rar`` file found below ``srcDir``.

        Each archive is unpacked into ``srcDir/<archive stem>``. Archives are
        opened with ``zipfile``, so a real RAR file is reported as
        "cannot be extracted".

        NOTE(review): ``dstDir`` is accepted but not used; the archives are
        unpacked next to the source files, and the usage example at the bottom
        of this file moves the pictures out of the zip folder afterwards.
        """
        src_dir = Path(srcDir)
        for root, _dirs, files in os.walk(src_dir):
            for name in files:
                file_path = Path(root) / name
                if file_path.suffix not in ('.zip', '.rar'):
                    continue

                target_dir = src_dir / file_path.stem
                try:
                    with zipfile.ZipFile(str(file_path), 'r') as archive:
                        target_dir.mkdir(parents=True, exist_ok=True)
                        archive.extractall(str(target_dir))
                except (zipfile.BadZipFile, OSError, RuntimeError,
                        NotImplementedError):
                    print('{} cannot be extracted'.format(str(file_path)))
                else:
                    print('{} is extracted'.format(str(file_path)))

    # ------------------------------------------------------------------
    # hashing
    # ------------------------------------------------------------------
    @staticmethod
    def getFileMd5(file_name):
        """Return the hexadecimal MD5 digest of the file ``file_name``.

        The file is read in chunks, so large files are not loaded at once.
        """
        digest = md5()
        with open(str(file_name), 'rb') as fobj:
            for chunk in iter(lambda: fobj.read(65536), b''):
                digest.update(chunk)
        return digest.hexdigest()

    @staticmethod
    def getImageMd5(img_path):
        """Return the hexadecimal MD5 digest of ``img_path``, or ``None`` if unreadable."""
        try:
            # "with" closes the handle even when reading fails.
            with open(str(img_path), 'rb') as img:
                return md5(img.read()).hexdigest()
        except OSError:
            return None

    # ------------------------------------------------------------------
    # syncing / comparing folders
    # ------------------------------------------------------------------
    @staticmethod
    def syncFiles(srcDir, dstDir):
        """Copy the files below ``srcDir`` to the same relative path below ``dstDir``.

        A file is skipped when the destination exists with the same size.
        When the destination exists with a different size, the source is
        copied next to it under a numbered name (see ``copyFile``); nothing
        is overwritten.
        """
        src_dir = Path(srcDir)
        dst_dir = Path(dstDir)
        for root, _dirs, files in os.walk(src_dir):
            for name in files:
                src_path = Path(root) / name
                dst_path = dst_dir / src_path.relative_to(src_dir)

                if (dst_path.exists()
                        and src_path.stat().st_size == dst_path.stat().st_size):
                    print('{} is existed'.format(str(src_path)))
                    continue

                dst_path.parent.mkdir(parents=True, exist_ok=True)
                PathBox.copyFile(src_path, dst_path)
                print('{} is copied'.format(str(src_path)))

    @staticmethod
    def compareTwoDirsByCount(srcDir, dstDir):
        """Delete sub-folders of ``srcDir`` whose file count matches ``dstDir``.

        For every direct sub-folder of ``srcDir`` the files are counted
        recursively and compared with the count of the equally named
        sub-folder of ``dstDir``. When both counts are equal, the sub-folder
        of ``srcDir`` is removed permanently with ``shutil.rmtree``.

        Plain files in ``srcDir`` and sub-folders without a counterpart in
        ``dstDir`` are left untouched.
        """
        for entry in os.listdir(srcDir):
            srcpath = os.path.join(srcDir, entry)
            dstpath = os.path.join(dstDir, entry)
            # rmtree on a plain file raises; and without a counterpart there
            # is nothing to compare against, so nothing may be deleted.
            if not os.path.isdir(srcpath) or not os.path.isdir(dstpath):
                continue

            src_count = sum(len(files) for _r, _d, files in os.walk(srcpath))
            dst_count = sum(len(files) for _r, _d, files in os.walk(dstpath))
            if src_count == dst_count:
                shutil.rmtree(srcpath)
                print('{} is removed'.format(srcpath))

    @staticmethod
    def compareDirsDeleteTheSameFile(srcDir, dstDir, mode='keep'):
        """Send files of ``srcDir`` that also exist in ``dstDir`` to the trash.

        Files are matched by relative path only; the content is not compared.

        :param mode: ``'delete'`` moves the matching source files to the
            trash; any other value (default ``'keep'``) changes nothing
        """
        if mode != 'delete':
            return

        src_root = Path(srcDir)
        dst_root = Path(dstDir)
        for root, _dirs, files in os.walk(src_root):
            for name in files:
                src_path = Path(root) / name
                dst_path = dst_root / src_path.relative_to(src_root)
                if not dst_path.exists():
                    continue
                # Both folders pointing at the same place must not wipe it.
                if dst_path.samefile(src_path):
                    continue
                try:
                    send2trash(str(src_path))
                except OSError:
                    print('{} cannot be removed'.format(src_path))
                else:
                    print('{} is removed'.format(src_path))

    @staticmethod
    def batchRemoveTheSameFileByMD5(srcKeepDir, srcCompareDirs=None):
        """Send files with identical content (same MD5) to the trash.

        ``srcKeepDir`` is scanned first, then every folder in
        ``srcCompareDirs``. The first file seen for a digest is kept; every
        later file with the same digest is moved to the trash.

        :param srcKeepDir: folder scanned first
        :param srcCompareDirs: optional iterable of further folders
        """
        first_seen = {}
        folders = [srcKeepDir] + list(srcCompareDirs or [])
        for folder in folders:
            for root, _dirs, files in os.walk(Path(folder)):
                for name in files:
                    f_path = Path(root) / name
                    digest = PathBox.getFileMd5(f_path)
                    kept = first_seen.get(digest)
                    if kept is None:
                        first_seen[digest] = f_path
                        continue
                    # Overlapping folders visit the same file twice; it is
                    # not a duplicate of itself and must not be trashed.
                    if kept.exists() and kept.samefile(f_path):
                        continue
                    send2trash(str(f_path))
                    print('{} is removed'.format(f_path))

    # ------------------------------------------------------------------
    # renaming
    # ------------------------------------------------------------------
    @staticmethod
    def batchRenameFileSuffix(srcDir):
        """Rename every ``.jpeg``/``.jpg`` file below ``srcDir`` to ``<stem>.JPG``.

        A file is skipped when a different file already owns the target name,
        so nothing is overwritten.
        """
        for root, _dirs, files in os.walk(srcDir):
            for name in files:
                srcpath = Path(root) / name
                if srcpath.suffix not in ('.jpeg', '.jpg'):
                    continue

                target = srcpath.with_name(srcpath.stem + '.JPG')
                # On a case-insensitive file system "a.jpg" and "a.JPG" are
                # the same file; only a different file blocks the rename.
                if target.exists() and not target.samefile(srcpath):
                    print('{} is skipped, {} already exists'.format(
                        srcpath, target))
                    continue

                srcpath.rename(target)
                print('{} is renamed'.format(srcpath))

    @staticmethod
    def batchRenameFileName(srcDir):
        """Rename every file below ``srcDir`` to ``<n><suffix>`` inside its folder.

        For each file the smallest ``n`` (starting at 1) whose name is not yet
        taken in that folder is used.
        """
        for root, _dirs, files in os.walk(srcDir):
            folder = Path(root)
            for name in files:
                file_path = folder / name
                index = 1
                while True:
                    candidate = folder / (str(index) + file_path.suffix)
                    if not candidate.exists():
                        break
                    index += 1
                file_path.rename(candidate)
                print('{} is renamed'.format(str(file_path)))

    # ------------------------------------------------------------------
    # statistics / cleanup
    # ------------------------------------------------------------------
    @staticmethod
    def tongji(srcDir):
        """Print the number of files in every sub-folder below ``srcDir``.

        Only files placed directly in a folder are counted. ``srcDir`` itself
        is not reported.
        """
        walker = os.walk(srcDir)
        next(walker, None)  # the first entry is srcDir itself
        for root, _dirs, files in walker:
            print('{0} have total of {1} files'.format(root, len(files)))

    @staticmethod
    def rmEmptyDirs(srcDir):
        """Remove every empty folder below ``srcDir`` (``srcDir`` itself is kept).

        The tree is walked bottom-up, so a folder that only contains empty
        folders is removed in the same pass.
        """
        top = Path(srcDir)
        for root, _dirs, _files in os.walk(srcDir, topdown=False):
            folder = Path(root)
            if folder == top:
                continue
            try:
                folder.rmdir()
            except OSError:
                # Not empty (or not removable): leave it in place.
                pass

    # ------------------------------------------------------------------
    # pictures
    # ------------------------------------------------------------------
    @staticmethod
    def batchResizePics(srcDir):
        """Write thumbnails of all pictures below ``srcDir`` to ``<srcDir>-resize``.

        Pictures (``.jpg``/``.jpeg``/``.png``, any letter case) are shrunk
        with ``Image.thumbnail`` to at most 800x600 and saved as JPEG under
        their original file name and relative path. A picture that already is
        exactly 800x600 is moved instead of re-encoded.

        :return: the destination folder as a string
        """
        size = (800, 600)
        picture_exts = ('.jpg', '.jpeg', '.png')

        src_root = Path(srcDir)
        # normpath drops a trailing separator, which would otherwise place the
        # destination folder inside the source folder.
        dstDir = os.path.normpath(os.fspath(srcDir)) + "-resize"
        dst_root = Path(dstDir)

        for root, _dirs, files in os.walk(src_root):
            # Map by relative path; str.replace would also rewrite a folder
            # name that merely repeats the source path text.
            newroot = dst_root / Path(root).relative_to(src_root)
            newroot.mkdir(parents=True, exist_ok=True)

            for name in files:
                if Path(name).suffix.lower() not in picture_exts:
                    continue

                oldfile = Path(root) / name
                newfile = newroot / name
                try:
                    with Image.open(oldfile) as im:
                        already_sized = im.size == size
                        if not already_sized:
                            im.thumbnail(size)
                            # JPEG has no alpha channel or palette mode.
                            out = im if im.mode in ('RGB', 'L') else im.convert('RGB')
                            out.save(newfile, "jpeg")
                except OSError as err:
                    print('{} cannot be processed: {}'.format(oldfile, err))
                    continue

                if not already_sized:
                    print('{} is thumbnailed'.format(oldfile))
                    continue

                # The image is closed by now; moving an open file fails on
                # Windows.
                try:
                    PathBox.moveFile(oldfile, newfile)
                except OSError as err:
                    print('{} cannot be moved: {}'.format(oldfile, err))
        return dstDir

    # ------------------------------------------------------------------
    # moving / copying
    # ------------------------------------------------------------------
    @staticmethod
    def batchMoveNonePicsToOneFolder(srcDir, dstDir='parent'):
        """Move every non-picture file below ``srcDir`` into one folder.

        :param dstDir: target folder; the default ``'parent'`` means
            ``srcDir`` itself
        """
        picture_exts = ('.jpg', '.png', '.jpeg', '.JPG', '.PNG', '.JPEG')
        src_dir = Path(srcDir)
        dst_dir = src_dir if dstDir == 'parent' else Path(dstDir)
        dst_dir.mkdir(parents=True, exist_ok=True)

        for root, _dirs, files in os.walk(src_dir):
            if Path(root) == dst_dir:
                continue  # these files already are where they belong
            for name in files:
                src_path = Path(root) / name
                if src_path.suffix in picture_exts:
                    continue
                PathBox.moveFile(src_path, dst_dir / name)

    @staticmethod
    def excludeFilesByName(srcDir, dstDir='parent'):
        """Move every file whose name does not start with ``IMG`` into one folder.

        :param dstDir: target folder; the default ``'parent'`` means
            ``srcDir`` itself
        """
        src_dir = Path(srcDir)
        dst_dir = src_dir if dstDir == 'parent' else Path(dstDir)
        dst_dir.mkdir(parents=True, exist_ok=True)

        for root, _dirs, files in os.walk(src_dir):
            if Path(root) == dst_dir:
                continue  # these files already are where they belong
            for name in files:
                if name.startswith("IMG"):
                    continue
                PathBox.moveFile(Path(root) / name, dst_dir / name)

    @staticmethod
    def batchMoveFilesToOneFolder(srcDir, dstDir='parent', fexts='all'):
        """Flatten ``srcDir``: move the files of all sub-folders into one folder.

        :param dstDir: target folder; ``'parent'`` (default) or an empty value
            means ``srcDir`` itself
        :param fexts: ``'all'`` (default) or a collection of suffixes such as
            ``['.jpg', '.png']``; only files with one of these suffixes are
            moved (the comparison is case-sensitive)
        """
        src_dir = Path(srcDir)
        if dstDir == 'parent' or not dstDir:
            dst_dir = src_dir
        else:
            dst_dir = Path(dstDir)
        dst_dir.mkdir(parents=True, exist_ok=True)

        for root, _dirs, files in os.walk(src_dir):
            if Path(root) == dst_dir:
                continue  # these files already are where they belong
            for name in files:
                src_path = Path(root) / name
                if fexts != 'all' and src_path.suffix not in fexts:
                    continue
                PathBox.moveFile(src_path, dst_dir / name)

    @staticmethod
    def moveFile(src_path, dst_path):
        """Move ``src_path`` to ``dst_path`` without overwriting anything.

        When ``dst_path`` is taken, ``_1``, ``_2``, ... is appended to the
        stem until a free name is found. Moving a file onto itself is a no-op.

        :return: the path the file ended up at
        """
        src_path = Path(src_path)
        dst_path = Path(dst_path)
        if dst_path.exists() and dst_path.samefile(src_path):
            return dst_path

        target = PathBox._uniquePath(dst_path)
        shutil.move(str(src_path), str(target))
        print('{} is moved'.format(src_path))
        return target

    @staticmethod
    def copyFile(src_path, dst_path):
        """Copy ``src_path`` to ``dst_path`` without overwriting anything.

        When ``dst_path`` is taken, ``_1``, ``_2``, ... is appended to the
        stem until a free name is found. A failed copy is reported on stdout
        and does not raise.

        :return: the path of the copy, or ``None`` when copying failed
        """
        target = PathBox._uniquePath(Path(dst_path))
        try:
            shutil.copyfile(str(src_path), str(target))
        except OSError as err:
            print('{} cannot be copied: {}'.format(src_path, err))
            return None
        return target
433
434
def main():
    """Run one batch job; the remaining jobs are kept as commented usage examples."""
    # ---------------- extract pics from docx ----------------
    # srcDir = r'D:\Civil\32109012\_工具包\_参考资料\_桥梁检测报告'
    # dstDir = r'D:\Civil\extract'
    # zipDir = r'D:\Civil\zip'
    # PathBox.batchExtractPicsFromDocs(srcDir, dstDir, zipDir)
    # PathBox.batchMoveFilesToOneFolder(zipDir, dstDir, fexts=['.jpg', '.png', '.emf', '.jpeg'])

    # ---------------- batch move files selected by suffix ----------------
    # PathBox.batchMoveFilesToOneFolder(srcDir, fexts=['.doc'])
    # srcDir = r'D:\Civil\32109012\_工具包\_softSmall'
    # PathBox.batchMoveFilesToOneFolder(srcDir)

    # ---------------- batch remove identical files in dirs ----------------
    srcKeepDir = r'D:\_soft'
    # srcCompareDirs = [r'D:\test']
    PathBox.batchRemoveTheSameFileByMD5(srcKeepDir)
    # PathBox.batchRemoveTheSameFileByMD5(srcKeepDir, srcCompareDirs)

    # ---------------- batch resize the images in dirs ----------------
    # srcDir = r'D:\BaiduNetdiskDownload\温州东瓯DAQIAO'
    # # dstDir = r'D:\衢州报告-resize'
    # dstDir = PathBox.batchResizePics(srcDir)
    # PathBox.compareDirsDeleteTheSameFile(srcDir, dstDir, mode='delete')
    # PathBox.rmEmptyDirs(srcDir)

    # ---------------- batch sync the files between two dirs ----------------
    # srcDir = r'C:\Users\Administrator\Documents\debug\wolf'
    # dstDir = r'C:\Users\Administrator\Documents\debug\_待整理'
    # PathBox.syncFiles(srcDir, dstDir)
    # PathBox.compareDirsDeleteTheSameFile(srcDir, dstDir, mode='delete')
    # PathBox.rmEmptyDirs(srcDir)

    # ---------------- batch move files whose name does not start with IMG ----------------
    # srcDir = r'D:\衢州报告'
    # PathBox.excludeFilesByName(srcDir)


if __name__ == '__main__':
    main()

View Code