Reference
https://github.com/google-research/bert/blob/master/tokenization.py

Usage: build a BERT-style vocab.txt from a raw text corpus with BasicTokenizer:

import tokenization  # tokenization.py from google-research/bert, placed on the import path

tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

# BERT's special tokens go first so they get the conventional low ids (0-4).
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# Collect the unique tokens in the corpus.
word_set = set()
with open("sample_text.txt", mode="r", encoding="utf-8") as f:
    for line in f:
        for word in tokenizer.tokenize(line):
            word_set.add(word)

# One token per line; sorting makes the vocab order deterministic across runs.
with open("vocab.txt", mode="w", encoding="utf-8") as f2:
    for token in special_tokens:
        f2.write(token + "\n")
    for word in sorted(word_set):
        if word != " " and word != "":
            f2.write(word + "\n")
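
To sanity-check the generated vocab.txt, you can load it back with the same module's FullTokenizer (a minimal sketch; the sample sentence is made up, and it assumes tokenization.py and vocab.txt sit next to the script):

import tokenization

# FullTokenizer = BasicTokenizer + WordpieceTokenizer over the vocab file.
full_tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)

tokens = full_tokenizer.tokenize("this is a sample sentence")
ids = full_tokenizer.convert_tokens_to_ids(tokens)
print(tokens)  # words missing from vocab.txt come back as [UNK]
print(ids)     # with the header above: [PAD]=0, [UNK]=1, [CLS]=2, [SEP]=3, [MASK]=4

Since this vocab contains whole words only (no "##" wordpieces), the wordpiece step is trivial: any word not in vocab.txt maps to [UNK] as a whole.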