python数据处理的一些公用方法
最近做实验,写了很多程序处理数据集,总结一下,省得下回重写。
1.get_all_files:遍历路径下所有的文件,以文件名排序
def get_all_files(bg_path):
    """Recursively collect every file below ``bg_path``, sorted by numeric suffix.

    Args:
        bg_path: Directory to scan. Every entry that is not a regular file is
            treated as a sub-directory and descended into.

    Returns:
        A list of file paths (each prefixed with ``bg_path``) in ascending
        order of ``int(path[-9:-4])``, i.e. the five characters that precede
        a four-character extension such as ``.jpg``.

    Raises:
        ValueError: If those five characters of any path are not an integer.
    """
    def _collect(directory, found):
        # Depth-first walk in os.listdir order; appends into one shared list.
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                found.append(full_path)
            else:
                _collect(full_path, found)
        return found

    files = _collect(bg_path, [])
    # Sort once at the end instead of re-sorting at every recursion level.
    # The sort is stable, so the resulting order is the same as before.
    files.sort(key=lambda x: int(x[-9:-4]))
    return files
1.1遍历目录
for i in os.listdir(jpg_path):
1.2分离文件名和文件类型
(filename,extension) = os.path.splitext(filename_type)
2.创建目录
def mkdir(path):
    """Create the directory ``path`` (including parents) if it is missing.

    Leading/trailing whitespace and trailing backslashes are removed from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if something
        already exists at that path.
    """
    # Strip surrounding whitespace, then any trailing "\" characters.
    path = path.strip()
    path = path.rstrip("\\")
    try:
        # Attempt the creation directly instead of checking os.path.exists
        # first: the check-then-create sequence can race with another process.
        os.makedirs(path)
    except FileExistsError:
        # Already present: nothing is created and nothing is printed.
        return False
    print('%s创建成功' % path)
    return True
4.bbox相关的三个:根据mask计算bbox,判断两个bbox是不是相交,判断遮挡了自己多少
def findbbox(mask):  # compute the bounding box of a mask
    """Return the bounding box of the non-zero region of ``mask``.

    Args:
        mask: Array indexed as ``mask[row][col][channel]``. Only channel 0 is
            inspected; a pixel counts as foreground when its value is > 0.
            The array is no longer modified in place.

    Returns:
        ``[minx, miny, maxx, maxy]`` as Python ints, where x is the column
        index and y is the row index. For a mask without any foreground pixel
        the legacy sentinel ``[1000, 1000, 0, 0]`` is returned.
    """
    import numpy as np

    foreground = np.asarray(mask)[:, :, 0] > 0
    rows, cols = np.nonzero(foreground)
    if rows.size == 0:
        # Keep the value the loop-based version produced for an empty mask.
        return [1000, 1000, 0, 0]
    # Real minima/maxima: the previous fixed start value of 1000 gave wrong
    # minima for objects located at coordinates >= 1000.
    return [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
# print("minx:%d"%minx)
# print("miny:%d"%miny)
# print("maxx:%d"%maxx)
# print("maxy:%d"%maxy)
# scipy.misc.imshow(mask)
def mat_inter(box1, box2):  # test whether two rectangles intersect
    """Tell whether two boxes given as ``(x0, y0, x1, y1)`` overlap or touch.

    The boxes intersect when, on both axes, the distance between their
    centres does not exceed half of the sum of their side lengths.

    Returns:
        True if the boxes intersect (shared edges count), otherwise False.
    """
    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2

    # Distance between the two centres on each axis.
    center_gap_x = abs((ax0 + ax1) / 2 - (bx0 + bx1) / 2)
    center_gap_y = abs((ay0 + ay1) / 2 - (by0 + by1) / 2)

    # Side lengths of both boxes on each axis.
    width_a, width_b = abs(ax0 - ax1), abs(bx0 - bx1)
    height_a, height_b = abs(ay0 - ay1), abs(by0 - by1)

    close_in_x = center_gap_x <= (width_a + width_b) / 2
    close_in_y = center_gap_y <= (height_a + height_b) / 2
    return True if (close_in_x and close_in_y) else False
def solve_coincide(box1, box2):  # overlap of two rectangles relative to box1
    """Return the fraction of ``box1`` that is covered by ``box2``.

    Args:
        box1: ``(x0, y0, x1, y1)`` of the reference box.
        box2: ``(x0, y0, x1, y1)`` of the other box.

    Returns:
        ``intersection_area / area(box1)`` when ``mat_inter`` reports that the
        boxes intersect, ``0.0`` when they intersect but ``box1`` has zero
        area, and ``False`` (kept for backward compatibility) when they do
        not intersect at all.
    """
    if not mat_inter(box1, box2):
        return False
    x01, y01, x02, y02 = box1
    x11, y11, x12, y12 = box2
    col = min(x02, x12) - max(x01, x11)
    row = min(y02, y12) - max(y01, y11)
    area1 = (x02 - x01) * (y02 - y01)
    if area1 == 0:
        # A degenerate box1 (a line or a point) used to raise
        # ZeroDivisionError; nothing of it can be covered, so report 0.
        return 0.0
    return col * row / area1
5.txt的创建和写入
# Reading
f = open(point_path_filter, 'r')
line = f.readline()  # reads a single line
lines = f.readlines()  # reads all the remaining lines into a list
def read_file(path):
    """Return all lines of the text file at ``path``.

    Args:
        path: Path of the file to read (opened in text mode ``'r'``).

    Returns:
        A list with one string per line, line endings included.
    """
    # The context manager closes the file even if reading raises.
    with open(path, 'r') as fid:
        return fid.readlines()
# Writing
out_label_path = out_path +"/"+ category_main+'/txt/' + label_path
file = open(out_label_path, 'w')  # the file is created if it does not exist yet
file.write(line + "\n")  # afterwards keep writing through this same file object
6.分割训练集和测试集合:这里是000000这种格式:
import os
import math
import random
# 1. Fraction of the samples that goes into the test split
test=0.3
# Directory whose files are counted to determine the number of samples
image_path='../output/ape/image'
def get_all_files(bg_path):
    """Recursively collect every file below ``bg_path``, sorted by numeric suffix.

    Args:
        bg_path: Directory to scan. Every entry that is not a regular file is
            treated as a sub-directory and descended into.

    Returns:
        A list of file paths (each prefixed with ``bg_path``) in ascending
        order of ``int(path[-9:-4])``, i.e. the five characters that precede
        a four-character extension such as ``.jpg``.

    Raises:
        ValueError: If those five characters of any path are not an integer.
    """
    def _collect(directory, found):
        # Depth-first walk in os.listdir order; appends into one shared list.
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                found.append(full_path)
            else:
                _collect(full_path, found)
        return found

    files = _collect(bg_path, [])
    # Sort once at the end instead of re-sorting at every recursion level.
    # The sort is stable, so the resulting order is the same as before.
    files.sort(key=lambda x: int(x[-9:-4]))
    return files
# Split the samples into ./test.txt and ./train.txt.
# Samples are identified by zero-padded indices (000000, 000001, ...), one per
# file found under image_path; the real file names are not used.
files = get_all_files(image_path)
num = len(files)
test_num = int(test * num)

# 2. Draw the test indices without replacement.
# The whole index range is sampled: the previous range(0, num-1) excluded the
# last sample, so it could never end up in the test split.
random_index = random.sample(range(num), test_num)
name = ['%06d' % index for index in random_index]

# 3. Write the test split, one name per line, in the order it was drawn.
out_label_path = './test.txt'
with open(out_label_path, 'w') as file:
    file.writelines(sample_name + "\n" for sample_name in name)

# Every index that was not drawn goes to the training split, ascending.
# A set gives O(1) membership tests instead of repeated list.remove calls.
test_indices = set(random_index)
train_names = ['%06d' % index for index in range(num) if index not in test_indices]
out_label_path = './train.txt'
with open(out_label_path, 'w') as file2:
    file2.writelines(sample_name + "\n" for sample_name in train_names)
import os
import numpy as np
import scipy.misc
import random
import cv2
import sys
import shutil
import time
# Size of the index range the validation sample is drawn from
num_img=9999
# Number of image/mask pairs to move: 20% of 9999 ("vaild" is a misspelling of "valid")
vaild=int(9999*0.2)
def get_all_files(bg_path):
    """Recursively collect every file below ``bg_path``, sorted by numeric suffix.

    Args:
        bg_path: Directory to scan. Every entry that is not a regular file is
            treated as a sub-directory and descended into.

    Returns:
        A list of file paths (each prefixed with ``bg_path``) in ascending
        order of ``int(path[-8:-4])``, i.e. the four characters that precede
        a four-character extension such as ``.png``.

    Raises:
        ValueError: If those four characters of any path are not an integer.
    """
    def _collect(directory, found):
        # Depth-first walk in os.listdir order; appends into one shared list.
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                found.append(full_path)
            else:
                _collect(full_path, found)
        return found

    files = _collect(bg_path, [])
    # Sort once at the end instead of re-sorting at every recursion level.
    # The sort is stable, so the resulting order is the same as before.
    files.sort(key=lambda x: int(x[-8:-4]))
    return files
# Move a random subset of image/mask pairs into the validation folders.
# Images and masks are paired by their position in the two sorted listings.
mask_files = get_all_files('./annotations')
img_files = get_all_files('./images')
out_img = './validation/images'
out_mask = './validation/annotations'

# shutil.move does not create missing destination folders.
os.makedirs(out_img, exist_ok=True)
os.makedirs(out_mask, exist_ok=True)

# Never sample an index that one of the listings does not have; the fixed
# num_img population raised IndexError on datasets with fewer files.
population = min(num_img, len(img_files), len(mask_files))
randomlist = random.sample(range(population), min(vaild, population))
for random_ims_index in randomlist:
    print(random_ims_index)
    random_img = img_files[random_ims_index]
    random_mask = mask_files[random_ims_index]
    # Use the real file name rather than the last 14 characters of the path,
    # which is only correct for names of exactly that length.
    out_name_img = os.path.join(out_img, os.path.basename(random_img))
    out_name_mask = os.path.join(out_mask, os.path.basename(random_mask))
    shutil.move(random_img, out_name_img)
    shutil.move(random_mask, out_name_mask)
7.GEN_Annotations:用于生成xml文件,使用方法如下:
from lxml import etree  # import this package
anno = GEN_Annotations(name)  # 1. create the object from a name; this name is the XML file name
anno.set_size(640, 480, 3)  # 2. pass the image size and the number of channels
anno.add_pic_attr(category_name, xmin, ymin, xmax, ymax, str(splitlines[3]),str(splitlines[4]),
str(splitlines[5]), str(splitlines[6]), str(splitlines[7]), str(splitlines[8]),
str(splitlines[9]), str(splitlines[10])
, str(splitlines[11]), str(splitlines[12]),str(splitlines[13]), str(splitlines[14]),
str(splitlines[15]), str(splitlines[16]),
str(splitlines[17]), str(splitlines[18]))  # 3. pass the node values
xml_path=out_path + "/" + category_main + '/xml/' + name+'.xml'
anno.savefile(xml_path)  # 4. save to the given path
# Generate the XML
class GEN_Annotations:
    """Build an ``<annotation>`` XML tree holding boxes and 2D keypoints.

    The tree contains a ``<filename>``, an optional ``<size>`` and one
    ``<object>`` per ``add_pic_attr`` call. Relies on the module-level name
    ``etree`` (``from lxml import etree``).
    """

    # Image size used to scale keypoints when set_size() was never called;
    # these are the values that used to be hard-coded in add_pic_attr().
    DEFAULT_WIDTH = 640
    DEFAULT_HEIGHT = 480

    def __init__(self, filename):
        """Create the root element and store ``filename`` in ``<filename>``."""
        self.root = etree.Element("annotation")
        child1 = etree.SubElement(self.root, "filename")
        child1.text = filename
        self._width = self.DEFAULT_WIDTH
        self._height = self.DEFAULT_HEIGHT

    def set_size(self, witdh, height, channel):
        """Append ``<size>`` with ``<width>``/``<height>`` and remember the size.

        Args:
            witdh: Image width (parameter name kept as-is for compatibility).
            height: Image height.
            channel: Accepted for compatibility; it is not written to the XML.
        """
        size = etree.SubElement(self.root, "size")
        etree.SubElement(size, "width").text = str(witdh)
        etree.SubElement(size, "height").text = str(height)
        # Keypoints added afterwards are scaled by this size instead of a
        # fixed 640x480.
        self._width = float(witdh)
        self._height = float(height)

    def savefile(self, xml_path):
        """Write the tree to ``xml_path`` (pretty-printed UTF-8, no XML declaration)."""
        tree = etree.ElementTree(self.root)
        tree.write(xml_path, pretty_print=True, xml_declaration=False, encoding='utf-8')

    def add_pic_attr(self, label, xmin, ymin, xmax, ymax, x1, y1, x2, y2, x3, y3, x4, y4,
                     x5, y5, x6, y6, x7, y7, x8, y8):
        """Append one ``<object>`` with its name, bounding box and 8 keypoints.

        Args:
            label: Text of the ``<name>`` element.
            xmin, ymin, xmax, ymax: Bounding box values, written via ``str()``.
            x1..x8, y1..y8: Keypoint coordinates, converted with
                ``float(str(v))`` and multiplied by the image width (x) or
                height (y) before being written to ``<points2d>``.
        """
        obj = etree.SubElement(self.root, "object")
        etree.SubElement(obj, "name").text = label

        bndbox = etree.SubElement(obj, "bndbox")
        for tag, value in (("xmin", xmin), ("ymin", ymin), ("xmax", xmax), ("ymax", ymax)):
            etree.SubElement(bndbox, tag).text = str(value)

        points2d = etree.SubElement(obj, "points2d")
        keypoints = ((x1, y1), (x2, y2), (x3, y3), (x4, y4),
                     (x5, y5), (x6, y6), (x7, y7), (x8, y8))
        # Children are emitted as x1, y1, x2, y2, ... x8, y8.
        for index, (x_value, y_value) in enumerate(keypoints, start=1):
            etree.SubElement(points2d, "x%d" % index).text = str(float(str(x_value)) * self._width)
            etree.SubElement(points2d, "y%d" % index).text = str(float(str(y_value)) * self._height)
对应结果:
<annotation>
<filename>000000</filename>
<size>
<width>640</width>
<height>480</height>
</size>
<object>
<name>eggbox</name>
<bndbox>
<xmin>237</xmin>
<ymin>257</ymin>
<xmax>308</xmax>
<ymax>339</ymax>
</bndbox>
<points2d>
<x1>288.7264</x1>
<y1>348.72096</y1>
<x2>284.69888000000003</x2>
<y2>340.78752</y2>
<x3>234.76672000000002</x3>
<y3>331.69487999999996</y3>
<x4>227.35807999999997</x4>
<y4>322.80143999999996</y4>
<x5>314.06784</x5>
<y5>274.66704000000004</y5>
<x6>311.80096</x6>
<y6>262.55568</y6>
<x7>262.20608</x7>
<y7>259.47695999999996</y7>
<x8>256.82304</x8>
<y8>246.6264</y8>
</points2d>
</object>
<object>
<name>can</name>
<bndbox>
<xmin>318</xmin>
<ymin>200</ymin>
<xmax>397</xmax>
<ymax>313</ymax>
</bndbox>
<points2d>
<x1>391.76063999999997</x1>
<y1>318.99744</y1>
<x2>365.53600000000006</x2>
<y2>207.9336</y2>
<x3>315.95392</x3>
<y3>310.76592</y3>
<x4>289.70176</x4>
<y4>213.05232</y4>
<x5>424.73728</x5>
<y5>295.02144</y5>
<x6>401.82848</x6>
<y6>190.51488</y6>
<x7>349.69728000000003</x7>
<y7>289.94208</y7>
<x8>326.4096</x8>
<y8>197.34</y8>
</points2d>
</object>
<object>
<name>ape</name>
<bndbox>
<xmin>244</xmin>
<ymin>150</ymin>
<xmax>287</xmax>
<ymax>207</ymax>
</bndbox>
<points2d>
<x1>288.31424</x1>
<y1>208.85424</y1>
<x2>289.08608000000004</x2>
<y2>162.22128</y2>
<x3>244.11584000000002</x3>
<y3>209.17487999999997</y3>
<x4>242.46528</x4>
<y4>162.14927999999998</y4>
<x5>286.54656</x5>
<y5>187.57488</y5>
<x6>287.18208</x6>
<y6>142.66416</y6>
<x7>244.89024</x7>
<y7>187.70976000000002</y7>
<x8>243.37984</x8>
<y8>142.4352</y8>
</points2d>
</object>
<object>
<name>holepuncher</name>
<bndbox>
<xmin>287</xmin>
<ymin>347</ymin>
<xmax>364</xmax>
<ymax>412</ymax>
</bndbox>
<points2d>
<x1>351.78624</x1>
<y1>416.40912000000003</y1>
<x2>345.20704</x2>
<y2>356.328</y2>
<x3>282.40256</x3>
<y3>415.30848000000003</y3>
<x4>276.3296</x4>
<y4>358.41216000000003</y4>
<x5>373.77727999999996</x5>
<y5>398.14512</y5>
<x6>367.78495999999996</x6>
<y6>343.89696</y6>
<x7>309.71520000000004</x7>
<y7>398.02608</y7>
<x8>304.14336</x8>
<y8>346.38719999999995</y8>
</points2d>
</object>
<object>
<name>cat</name>
<bndbox>
<xmin>241</xmin>
<ymin>206</ymin>
<xmax>305</xmax>
<ymax>269</ymax>
</bndbox>
<points2d>
<x1>263.38752</x1>
<y1>218.31408</y1>
<x2>258.76544</x2>
<y2>181.04016</y2>
<x3>310.61184000000003</x3>
<y3>261.1416</y3>
<x4>310.89536</x4>
<y4>226.77168</y4>
<x5>236.17664000000002</x5>
<y5>239.64000000000001</y5>
<x6>228.54016000000001</x6>
<y6>203.65584</y6>
<x7>283.39455999999996</x7>
<y7>284.44752</y7>
<x8>280.6528</x8>
<y8>251.73023999999998</y8>
</points2d>
</object>
<object>
<name>duck</name>
<bndbox>
<xmin>393</xmin>
<ymin>222</ymin>
<xmax>443</xmax>
<ymax>284</ymax>
</bndbox>
<points2d>
<x1>415.23263999999995</x1>
<y1>251.8536</y1>
<x2>412.20224</x2>
<y2>206.78496</y2>
<x3>455.67424000000005</x3>
<y3>261.54479999999995</y3>
<x4>454.19904</x4>
<y4>214.5504</y4>
<x5>381.45152</x5>
<y5>280.01376000000005</y5>
<x6>376.7232</x6>
<y6>232.69632</y6>
<x7>423.22688</x7>
<y7>291.96768</y7>
<x8>420.14656</x8>
<y8>242.54543999999999</y8>
</points2d>
</object>
<object>
<name>driller</name>
<bndbox>
<xmin>308</xmin>
<ymin>75</ymin>
<xmax>387</xmax>
<ymax>235</ymax>
</bndbox>
<points2d>
<x1>352.68416</x1>
<y1>206.10384000000002</y1>
<x2>338.01408000000004</x2>
<y2>77.6952</y2>
<x3>397.4336</x3>
<y3>203.58048</y3>
<x4>385.51232</x4>
<y4>71.8008</y4>
<x5>314.22912</x5>
<y5>251.62511999999998</y5>
<x6>291.6992</x6>
<y6>88.24656</y6>
<x7>370.68544</x7>
<y7>249.74016</y7>
<x8>352.58048</x8>
<y8>80.84832</y8>
</points2d>
</object>
<object>
<name>glue</name>
<bndbox>
<xmin>393</xmin>
<ymin>72</ymin>
<xmax>469</xmax>
<ymax>201</ymax>
</bndbox>
<points2d>
<x1>396.23296000000005</x1>
<y1>176.80704</y1>
<x2>443.35168000000004</x2>
<y2>61.71216</y2>
<x3>444.08831999999995</x3>
<y3>199.73520000000002</y3>
<x4>497.38304</x4>
<y4>83.832</y4>
<x5>388.0928</x5>
<y5>181.91232</y5>
<x6>436.38272</x6>
<y6>62.42448</y6>
<x7>437.63968</x7>
<y7>205.88400000000001</y7>
<x8>492.59136</x8>
<y8>85.53696</y8>
</points2d>
</object>
</annotation>
7.得到当前文件夹下某格式的所有文件名字
def get_type(class_path, class_name):
    """Return the first ``<alphanumerics>.<class_name>`` name found in a folder.

    Args:
        class_path: Directory whose entries are listed with ``os.listdir``.
        class_name: File extension without the dot, e.g. ``"xyz"``. It is
            matched literally.

    Returns:
        The matched text of the first entry (in ``os.listdir`` order) that
        contains ``[0-9a-zA-Z]+.<class_name>``. Only the matched portion is
        returned, so an entry such as ``my_file.xyz`` yields ``file.xyz``.

    Raises:
        IndexError: If no entry matches (same exception type as before, now
            with an explanatory message).
    """
    # Raw string avoids the invalid "\." escape; re.escape keeps regex
    # metacharacters in the extension from changing the pattern.
    findtxt = re.compile(r'[0-9a-zA-Z]+\.' + re.escape(class_name))
    for entry in os.listdir(class_path):
        found = findtxt.search(entry)
        if found:
            return found.group(0)
    raise IndexError('no "*.%s" file found in %s' % (class_name, class_path))
调用方式:
s=get_type(class_xyz_path,"xyz")
8.打印完全的numpy值
import sys
import numpy as np
# Print arrays in full instead of abbreviating long ones with "...".
# (This line used to be a stray copy of the get_type call shown above.)
np.set_printoptions(threshold=sys.maxsize)
9.把输出值输出到文件
import sys
savedStdout = sys.stdout  # keep the real standard output so it can be restored
# Print NumPy arrays in full. The threshold has to be a number: the string
# 'nan' is rejected by current NumPy, sys.maxsize disables summarisation.
np.set_printoptions(threshold=sys.maxsize)
with open('./3dpoints_gt_z.txt', 'wt') as file:
    sys.stdout = file  # redirect standard output to the file
    try:
        print(transform_3d_gt[2].tolist())
    finally:
        # Restore standard output even if the print above raises.
        sys.stdout = savedStdout
10.使用matplotlib绘制3维图像
# Ground-truth data: one coordinate list per axis
gt_x = transform_3d_gt[0].tolist()
gt_y = transform_3d_gt[1].tolist()
gt_z = transform_3d_gt[2].tolist()
# Start plotting
fig = plt.figure(dpi=120)
ax = fig.add_subplot(111, projection='3d')
# Title
plt.title('point cloud')
# Draw one marker per (x, y, z) point.
# The former cmap='spectral' argument is dropped: that colormap name no longer
# exists in Matplotlib, and a colormap has no effect with the fixed colour c='b'.
ax.scatter(gt_x, gt_y, gt_z, c='b', marker='.', s=1, linewidth=0, alpha=0.5)
ax.axis('scaled')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.show()
11.平均分割文件夹
import shutil
import os
import os.path
numfile=10  # number of output folders the files are distributed over
input_file='bill1'  # directory that is scanned recursively for the files to distribute
def mkdir(path):
    """Create the directory ``path`` (including parents) if it is missing.

    Leading/trailing whitespace and trailing backslashes are removed from
    ``path`` before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if something
        already exists at that path.
    """
    path = path.strip()
    path = path.rstrip("\\")
    try:
        # Attempt the creation directly instead of checking os.path.exists
        # first: the check-then-create sequence can race with another process.
        os.makedirs(path)
    except FileExistsError:
        return False
    print('%s创建成功' % path)
    return True
def get_all_files(bg_path):
    """Recursively list every file below ``bg_path``.

    Entries are visited in ``os.listdir`` order; anything that is not a
    regular file is treated as a sub-directory and expanded in place.

    Returns:
        A list of file paths, each prefixed with ``bg_path``. No sorting is
        applied.
    """
    collected = []
    for entry in os.listdir(bg_path):
        full_path = os.path.join(bg_path, entry)
        if not os.path.isfile(full_path):
            # Sub-directory: splice its files in at this position.
            collected += get_all_files(full_path)
            continue
        collected.append(full_path)
    return collected
# NOTE(review): the same scan is executed again right before the split loop
# further down, so one of the two calls is redundant.
files=get_all_files(input_file)
def moveFileto(sourceDir, targetDir):
    """Copy ``sourceDir`` to ``targetDir`` using ``shutil.copy``.

    Despite the name, nothing is moved: the source file stays in place.
    """
    shutil.copy(sourceDir, targetDir)
# 1. Read all files
# 2. Work out how many files go into each folder
# 3. Create the folders one after another
# 4. Fill the first numfile-1 folders
# 5. Put whatever is left into the last folder
files = get_all_files(input_file)
file_num = int(len(files) / numfile)
flag_files = 0  # running count of files copied so far


def _copy_batch(batch, folder, copied):
    """Copy every path in ``batch`` into ``folder``; return the updated count."""
    for source in batch:
        moveFileto(source, os.path.join(folder, os.path.basename(source)))
        copied += 1
        print(copied)
    return copied


for i in range(numfile - 1):
    filename = "bill_" + str(i)
    mkdir(filename)
    flag_files = _copy_batch(files[flag_files:flag_files + file_num], filename, flag_files)

# The last folder receives all remaining files.
filename = "bill_" + str(numfile - 1)
mkdir(filename)
print("最后一个", flag_files)
flag_files = _copy_batch(files[flag_files:], filename, flag_files)
12.缩放文件
import os
import cv2
def get_all_files(bg_path):
    """Recursively collect every file below ``bg_path``, sorted by numeric suffix.

    Args:
        bg_path: Directory to scan. Every entry that is not a regular file is
            treated as a sub-directory and descended into.

    Returns:
        A list of file paths (each prefixed with ``bg_path``) in ascending
        order of ``int(path[-7:-4])``, i.e. the three characters that precede
        a four-character extension such as ``.jpg``.

    Raises:
        ValueError: If those three characters of any path are not an integer.
    """
    def _collect(directory, found):
        # Depth-first walk in os.listdir order; appends into one shared list.
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                found.append(full_path)
            else:
                _collect(full_path, found)
        return found

    files = _collect(bg_path, [])
    # Sort once at the end instead of re-sorting at every recursion level.
    # The sort is stable, so the resulting order is the same as before.
    files.sort(key=lambda x: int(x[-7:-4]))
    return files
# Resize every image under ./suoluetu/train to 480x360 and write it, under its
# original file name, into ./train.
images = get_all_files("./suoluetu/train")
outpath = "./train"
# cv2.imwrite does not create missing folders; it just fails to write.
os.makedirs(outpath, exist_ok=True)
for i in images:
    img = cv2.imread(i)
    if img is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # instead of letting cv2.resize fail on the whole batch.
        print('skipping unreadable image: %s' % i)
        continue
    img_test1 = cv2.resize(img, (480, 360))
    _, tmpfilename = os.path.split(i)
    name = os.path.join(outpath, tmpfilename)
    cv2.imwrite(name, img_test1)
13.文件批量改名字
import os
import uuid

# Rename every entry of files_path to 1.pdf, 2.pdf, ... in sorted name order.
files_path = "jindong2"
file_type = ".pdf"
file_flag = 1

# os.listdir order is arbitrary; sorting makes the numbering reproducible
# (lexicographic order, so "10.x" comes before "2.x").
entries = sorted(os.listdir(files_path))

# Phase 1: park every entry under a unique temporary name. Renaming straight
# to "N.pdf" could overwrite an entry with that name that has not been
# processed yet.
staged = []
for i in entries:
    src = os.path.join(files_path, i)
    parked = os.path.join(files_path, ".renaming-" + uuid.uuid4().hex)
    os.rename(src, parked)
    staged.append(parked)

# Phase 2: assign the final sequential names.
for parked in staged:
    file_path = os.path.join(files_path, str(file_flag) + file_type)
    os.rename(parked, file_path)
    file_flag = file_flag + 1