1. 如果不知道文本的编码,可以使用 chardet 库进行检测
# 测试文本编码
import chardet
# Path of the file whose encoding should be detected.
filename = "./all_data.txt"
# Read the raw bytes: chardet works on undecoded data, so the file must be
# opened in binary mode.
with open(filename, "rb") as f:
    data = f.read()
# chardet.detect returns its guess for the encoding of the given bytes.
encodeing_type = chardet.detect(data)
print(encodeing_type)
2. 转码:先用 gb2312 解码时报错,百度后改用字符范围更大的 gb18030(它是 gb2312 的超集)
# 将文本转为utf-8编码
import codecs
import chardet
# Input and output files.
filename_in = "all_data.txt"
filename_out = "all_data_utf8.txt"
# Input and output encodings.
encode_in = "gb18030"  # gb2312 raised a decode error; gb18030 covers a larger character range
encode_out = "utf-8"
# Re-encode the text. The built-in open() replaces the legacy codecs.open().
# newline="" disables newline translation on both the read and the write, so
# line endings are copied through unchanged; without it a CRLF file would be
# written back as "\r\r\n" on Windows.
with open(filename_in, mode="r", encoding=encode_in, newline="") as fi:
    data = fi.read()
with open(filename_out, mode="w", encoding=encode_out, newline="") as fo:
    fo.write(data)
# Check the result: re-read the output as raw bytes and let chardet guess
# its encoding.
with open(filename_out, "rb") as f:
    data = f.read()
encodeing_type = chardet.detect(data)
print(encodeing_type)