from transformers import BlipProcessor, BlipForConditionalGeneration, Blip2ForConditionalGeneration, Blip2Processor
from PIL import Image


# BLIP-2 with a Flan-T5-XL language model backbone.
# from_pretrained takes a Hub repo ID, not a URL.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Alternative: the smaller original BLIP captioning model.
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
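
# Optional sketch: blip2-flan-t5-xl has ~4B parameters, so CPU inference is
# slow. On a CUDA GPU (with the accelerate package installed), loading in
# half precision is a common setup -- illustrative, not required:
# import torch
# model = Blip2ForConditionalGeneration.from_pretrained(
#     "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16, device_map="auto"
# )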

# BLIP models expect RGB input; convert in case the PNG has an alpha channel.
img = Image.open("data/count_pingguo1.png").convert("RGB")
inputs = processor(images=img, return_tensors="pt")

# Generate caption token IDs, then decode them back to a string.
out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True).strip()

print(caption)
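
# BLIP-2 also supports prompted generation: pass text alongside the image and
# the model completes it. The question below is illustrative (the filename
# suggests an apple-counting image); any prompt string works.
prompt = "Question: how many apples are in the image? Answer:"
inputs = processor(images=img, text=prompt, return_tensors="pt")
out = model.generate(**inputs)
answer = processor.decode(out[0], skip_special_tokens=True).strip()
print(answer)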