from transformers import AutoTokenizer, BlipForConditionalGeneration, BlipProcessor, BlipTextConfig
from transformers.models.blip.modeling_blip_text import BlipTextLMHeadModel


# Load the pretrained BLIP captioning model. `from_pretrained` expects a Hub
# repo id ("org/name") or a local path — NOT a URL; the original
# "huggingface.co/..." prefix makes the repo lookup fail.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Replace the text decoder with a freshly initialized (untrained) one so it can
# be retrained, e.g. for Chinese captions with a BERT tokenizer.
# NOTE(review): BlipTextConfig() uses its default vocab_size, which may differ
# from the tokenizer actually used for training (e.g. bert-base-chinese has
# vocab_size=21128) — confirm and pass vocab_size explicitly if they differ.
text_config = BlipTextConfig()
model.text_decoder = BlipTextLMHeadModel(text_config)

# During actual training, a BERT tokenizer can be used to handle encoding and decoding.

# Tokenizer whose vocabulary the (re-initialized) decoder is trained against.
bertTokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# NOTE(review): `pixel_input_batch` is not defined in this snippet — presumably
# produced elsewhere by a BlipProcessor from a batch of images; confirm.
output_batch = model.generate(pixel_values=pixel_input_batch)

# Decode every generated sequence at once; iterating the output tensor itself
# avoids the undefined `batch_size` variable the original loop relied on.
for caption in bertTokenizer.batch_decode(output_batch, skip_special_tokens=True):
    print(caption)