from pydantic import BaseModel, Field
from typing import Union, Generator, Iterator
import os
import logging
import torch
from PIL import Image # Import Image from PIL
from transformers import AutoModelForCausalLM, AutoProcessor
from huggingface_hub import HfApi
# Configure the root logger at INFO level so the logging.info() calls in this
# module are emitted.
logging.basicConfig(level=logging.INFO)
class Pipe:
    """Run a Hugging Face model on a text prompt and an optional image.

    The model and processor are loaded lazily on the first call to
    :meth:`pipe` and cached on the instance for later calls.
    """

    class Valves(BaseModel):
        """User-configurable settings for the pipe."""

        MODEL_NAME: str = Field(
            default="meta-llama/Llama-3.2-11B-Vision-Instruct",
            description="Model name for Llama Vision.",
        )
        HUGGINGFACE_API_TOKEN: str = Field(
            default=os.getenv("HUGGINGFACE_API_TOKEN", ""),
            description="API token for authenticating requests to the Hugging Face API.",
        )
        MAX_NEW_TOKENS: int = Field(
            default=30,
            description="Maximum number of new tokens to generate per request.",
        )

    def __init__(self):
        """Create the pipe with default valves and no model loaded yet."""
        self.valves = self.Valves()
        self.model = None
        self.processor = None

    def validate_token(self, access_token) -> HfApi:
        """Return an ``HfApi`` client configured with *access_token*.

        This only constructs the client; no call is made here to check that
        the token is actually accepted.
        """
        return HfApi(token=access_token)

    def load_model(self, access_token):
        """Load the processor and model named by ``MODEL_NAME`` if not cached.

        The model is placed on the GPU in float16 when CUDA is available and
        otherwise stays on the CPU in float32, so the pipe no longer crashes
        on hosts without a GPU.

        Args:
            access_token: Hugging Face token forwarded to ``from_pretrained``
                (``None`` lets the library use its own default).
        """
        if self.processor is None:
            self.processor = AutoProcessor.from_pretrained(
                self.valves.MODEL_NAME,
                token=access_token,
            )
        if self.model is None:
            use_cuda = torch.cuda.is_available()
            # NOTE(review): the default checkpoint is a vision model; confirm
            # that AutoModelForCausalLM resolves to a class that accepts image
            # inputs for it (the model card uses a Mllama-specific class).
            model = AutoModelForCausalLM.from_pretrained(
                self.valves.MODEL_NAME,
                token=access_token,
                torch_dtype=torch.float16 if use_cuda else torch.float32,
            )
            self.model = model.cuda() if use_cuda else model

    @staticmethod
    def _extract_image_path(body: dict):
        """Return the first image reference in ``body["image"]``, or ``None``.

        Accepts either a non-empty list (first element is used) or a
        non-empty string.
        """
        image = body.get("image")
        if isinstance(image, list) and image:
            return image[0]
        if isinstance(image, str) and image:
            return image
        return None

    def pipe(
        self, body: dict, __user__: Union[dict, None] = None
    ) -> Union[str, Generator, Iterator]:
        """Generate text for the request described by *body*.

        Args:
            body: Request payload. ``"prompt"`` (default ``""``) is the text
                input; ``"image"`` is an optional image path or a list whose
                first element is the image path.
            __user__: Caller information; not used by this method.

        Returns:
            The decoded model output, or ``"Error: <message>"`` if anything
            fails while handling the request.
        """
        # An empty token string is treated as "no token" rather than being
        # sent as a credential.
        access_token = self.valves.HUGGINGFACE_API_TOKEN or None
        logging.info(body)
        try:
            prompt = body.get("prompt", "")
            image_path = self._extract_image_path(body)
            logging.info(prompt)
            logging.info(image_path)

            self.validate_token(access_token)
            self.load_model(access_token)
            logging.info("Model loaded successfully")

            if image_path:
                # The context manager closes the underlying file once the
                # processor has consumed the image.
                with Image.open(image_path) as image:
                    inputs = self.processor(
                        images=image, text=prompt, return_tensors="pt"
                    ).to(self.model.device)
            else:
                inputs = self.processor(text=prompt, return_tensors="pt").to(
                    self.model.device
                )

            output = self.model.generate(
                **inputs, max_new_tokens=self.valves.MAX_NEW_TOKENS
            )
            return self.processor.decode(output[0], skip_special_tokens=True)
        except Exception as e:
            # Top-level boundary: report the failure to the caller as text,
            # but keep the traceback in the log.
            logging.exception("Failed to process request: %s", e)
            return f"Error: {e}"
if __name__ == "__main__":
    pipe = Pipe()
    # Example usage: "image" is a list of image paths (only the first is
    # used), and the second argument is the __user__ dict that pipe() takes.
    result = pipe.pipe(
        {"prompt": "Describe this image", "image": ["path/to/image.jpg"]},
        {},
    )
    print(result)