NOTICE
Open WebUI Community is currently undergoing a major revamp to improve user experience and performance. Expected completion by year-end! ✨

Function
pipe
HuggingFace_LLama_Vision
Implementation of Hugging Face Llama Vision (requires CUDA)
Function ID
huggingface_llama__vision
Creator
@abhinavasr
Downloads
413+

Function Content
python
from pydantic import BaseModel, Field
from typing import Union, Generator, Iterator
import os
import logging
import torch
from PIL import Image  # Import Image from PIL
from transformers import AutoModelForCausalLM, AutoProcessor
from huggingface_hub import HfApi

# Configure the root logger at INFO level so the logging.info(...) calls made
# by the pipe below are emitted rather than filtered out.
logging.basicConfig(level=logging.INFO)


class Pipe:
    """Pipe that runs a Hugging Face vision-language model locally.

    The processor and model named by ``Valves.MODEL_NAME`` are loaded lazily on
    the first request and kept on the instance for subsequent requests. The
    model is moved to CUDA after loading, so a CUDA device is required.

    NOTE(review): the model is loaded through ``AutoModelForCausalLM``. Hugging
    Face documents Llama 3.2 Vision checkpoints as loading through
    ``MllamaForConditionalGeneration``; confirm that the auto class accepts
    image inputs for the configured checkpoint.
    """

    class Valves(BaseModel):
        """User-configurable settings for the pipe."""

        MODEL_NAME: str = Field(
            default="meta-llama/Llama-3.2-11B-Vision-Instruct",
            description="Model name for Llama Vision.",
        )
        HUGGINGFACE_API_TOKEN: str = Field(
            default=os.getenv("HUGGINGFACE_API_TOKEN", ""),
            description="API token for authenticating requests to the Hugging Face API.",
        )
        # Previously hard-coded to 30 in ``pipe``; the default keeps that value.
        MAX_NEW_TOKENS: int = Field(
            default=30,
            description="Maximum number of new tokens to generate per request.",
        )

    def __init__(self):
        self.valves = self.Valves()
        self.model = None
        self.processor = None
        # Name of the checkpoint currently held in ``self.model`` /
        # ``self.processor``; used to notice MODEL_NAME valve changes.
        self._loaded_model_name = None

    def validate_token(self, access_token) -> HfApi:
        """Build an ``HfApi`` client for ``access_token`` and return it.

        Despite the name, this method performs no validity check of its own:
        it only constructs the client object.

        Args:
            access_token: Hugging Face access token to attach to the client.

        Returns:
            The constructed ``HfApi`` instance.
        """
        return HfApi(token=access_token)

    def load_model(self, access_token):
        """Load the processor and model for ``self.valves.MODEL_NAME`` if needed.

        Nothing is done when a processor and model for the currently configured
        model name are already loaded. If the configured name differs from the
        loaded one, the old objects are released and the new checkpoint is
        loaded in their place.

        Args:
            access_token: Hugging Face access token. An empty string is
                forwarded as ``None`` so no blank token is sent.

        Raises:
            Exception: whatever ``from_pretrained`` or ``.cuda()`` raise; the
                instance is left with no model loaded in that case.
        """
        model_name = self.valves.MODEL_NAME
        already_loaded = (
            self.model is not None
            and self.processor is not None
            and getattr(self, "_loaded_model_name", None) == model_name
        )
        if already_loaded:
            return

        # Release references to any previously loaded checkpoint before
        # loading the replacement.
        self.model = None
        self.processor = None
        self._loaded_model_name = None

        token = access_token or None
        processor = AutoProcessor.from_pretrained(model_name, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=token,
            torch_dtype=torch.float16,
        ).cuda()

        # Publish only after both loads succeeded so a failure never leaves a
        # half-initialised pair behind.
        self.processor = processor
        self.model = model
        self._loaded_model_name = model_name

    @staticmethod
    def _first_image(image_field):
        """Return the first image reference in ``image_field``, or ``None``.

        Accepts either a single string or a non-empty list/tuple whose first
        element is used. Any other value yields ``None``.
        """
        if isinstance(image_field, str):
            return image_field or None
        if isinstance(image_field, (list, tuple)) and image_field:
            return image_field[0]
        return None

    def pipe(
        self, body: dict, __user__: Union[dict, None] = None
    ) -> Union[str, Generator, Iterator]:
        """Generate text for the prompt (and optional image) in ``body``.

        Args:
            body: Request dict. ``body["prompt"]`` is the text prompt (default
                ``""``). ``body["image"]`` may be a string or a list whose
                first element is passed to ``PIL.Image.open``.
            __user__: User information; not used by this implementation.

        Returns:
            The decoded model output, or ``"Error: <message>"`` if any step
            raised an exception.
        """
        access_token = self.valves.HUGGINGFACE_API_TOKEN
        logging.info(body)
        try:
            prompt = body.get("prompt", "")
            image_path = self._first_image(body.get("image"))

            logging.info(prompt)
            logging.info(image_path)

            self.validate_token(access_token)
            self.load_model(access_token)
            logging.info("Model loaded successfully")

            if image_path:
                # The context manager closes the underlying file once the
                # processor has consumed the image.
                with Image.open(image_path) as image:
                    inputs = self.processor(
                        images=image, text=prompt, return_tensors="pt"
                    ).to(self.model.device)
            else:
                inputs = self.processor(text=prompt, return_tensors="pt").to(
                    self.model.device
                )

            output = self.model.generate(
                **inputs, max_new_tokens=self.valves.MAX_NEW_TOKENS
            )
            return self.processor.decode(output[0], skip_special_tokens=True)
        except Exception as e:
            # Top-level boundary: report the failure to the caller as text and
            # keep the traceback in the log.
            logging.exception("Failed to process request: %s", e)
            return f"Error: {e}"


if __name__ == "__main__":
    # Manual example run: loads the configured model, which is moved to CUDA.
    pipe = Pipe()
    # "image" must be a list (only its first entry is used), and ``pipe``
    # takes the user dict as its second argument.
    result = pipe.pipe(
        {"prompt": "Describe this image", "image": ["path/to/image.jpg"]},
        __user__={},
    )
    print(result)