Whitepaper
Docs
Sign In
Function
Function
filter
v0.5
Vision for non-vision LLM (Pseudo-Vision Router)
Function ID
pseudo_vision_router
Creator
@kaneki
Downloads
151+
Lets you set a non-vision LLM together with a vision LLM that takes all images you send and describes them to the non-vision LLM
Get
README
No README available
Function Code
Show
""" title: Pseudo-Vision Router author: # email: # date: 2025-01-22 version: 0.5 license: # description: Provides pseudo image capabilities to non-vision models by describing images with a vision model and manipulating the context. """ from pydantic import BaseModel, Field from typing import Callable, Awaitable, Any, Optional, List, Dict import hashlib from open_webui.models.users import Users from open_webui.utils.chat import generate_chat_completion from fastapi import Request class Filter: class Valves(BaseModel): non_vision_model_ids: List[str] = Field( default_factory=list, description="List of non-vision model IDs that require pseudo-vision capability.", ) vision_model_id: str = Field( default="google.gemini-2.0-flash", description="The identifier of the vision model to be used for processing images.", ) image_description_prompt: str = Field( default=( """You are an image descriptor. Your task: Provide detailed descriptions of images so that someone with no vision can understand and work with them. - Be thorough and precise in your descriptions; there is no word limit. - Tailor your description to the content of the image: - For text-based images (e.g., a book page): - Transcribe the text exactly as it appears. - Include additional visual descriptions (e.g., "The book appears old and worn."). - For artistic images (e.g., paintings): - Offer creative or interpretative descriptions. - Include any visible text in the image along with the description. - Describe any people present in the image when applicable. - Use Markdown and LaTeX formatting where appropriate. - Write the description in the same language as the image content if applicable; otherwise, use English. - Provide only the description without additional commentary. """ ), description="Prompt to send to the vision model for image description.", ) image_context_text: str = Field( default="The following is a description of an image that was attached to the user's message. 
Treat it as if you can see the image yourself. Only consider this image if it's relevant to the user's prompt.", description="Text to insert between the image identifier and the image description.", ) status: bool = Field( default=True, description="Flag to enable or disable status messages." ) def __init__(self): self.valves = self.Valves() # Persistent storage for image descriptions self.image_description_cache: Dict[str, str] = {} async def inlet( self, body: dict, __event_emitter__: Callable[[Any], Awaitable[None]], __user__: Optional[dict] = None, __model__: Optional[dict] = None, __request__: Optional[Request] = None, # Added Request ) -> dict: if __model__["id"] not in self.valves.non_vision_model_ids: # No processing needed for vision models return body messages = body.get("messages", []) images_found = [] image_descriptions = [] # Initialize counters total_images_processed = 0 total_words = 0 cached_images = 0 # Extract images from user messages for idx_message, message in enumerate(messages): if message.get("role") == "user": content = message.get("content", "") # Check for images in content if isinstance(content, list): for idx_part, part in enumerate(content): if part.get("type") == "image": images_found.append( { "message_index": idx_message, "image_index_in_message": idx_part, "image": part.get("image"), "type": "image", } ) elif part.get("type") == "image_url": images_found.append( { "message_index": idx_message, "image_index_in_message": idx_part, "image_url": part.get("image_url"), "type": "image_url", } ) if message.get("images"): for idx_part, image in enumerate(message.get("images", [])): images_found.append( { "message_index": idx_message, "image_index_in_message": idx_part, "image": image, "type": "image", } ) images_to_route = 0 # Check images against cache and count how many need to be processed for image_info in images_found: image_key = self.get_image_key(image_info) if image_key and image_key in self.image_description_cache: cached_images 
+= 1 else: images_to_route += 1 # Send initial status message if self.valves.status and images_found: status_message = f"Replaced {cached_images} images from cache" if images_to_route > 0: status_message += f", Forwarding {images_to_route} images to {self.valves.vision_model_id}" await __event_emitter__( { "type": "status", "data": { "description": status_message, "done": False, }, } ) if not images_found: # No images to process return body # Process each image processed_image_count = 0 # Counter for images being processed (not cached) for idx, image_info in enumerate(images_found): # Generate a unique key for the image image_key = self.get_image_key(image_info) if not image_key: continue # Check if we already have a description if image_key in self.image_description_cache: description = self.image_description_cache[image_key] image_descriptions.append( { "description": description, "message_index": image_info["message_index"], "image_index_in_message": image_info["image_index_in_message"], } ) continue # Skip processing processed_image_count += 1 # Increment processed image counter try: # Prepare the image_part if image_info["type"] == "image_url" and image_info.get("image_url"): image_part = { "type": "image_url", "image_url": image_info["image_url"], } elif image_info["type"] == "image" and image_info.get("image"): image_part = {"type": "image", "image": image_info["image"]} else: continue # Skip if no valid image data # Construct messages to send to the vision model messages_to_vision_model = [ { "role": "user", "content": [ { "type": "text", "text": self.valves.image_description_prompt, }, image_part, ], }, ] payload = { "model": self.valves.vision_model_id, "messages": messages_to_vision_model, "stream": False, } user = ( Users.get_user_by_id(__user__["id"]) if __user__ and "id" in __user__ else None ) response = await generate_chat_completion( request=__request__, form_data=payload, user=user ) # Updated content = response["choices"][0]["message"]["content"] 
if content is not None: image_descriptions.append( { "description": content, "message_index": image_info["message_index"], "image_index_in_message": image_info[ "image_index_in_message" ], } ) # Store the description in the cache self.image_description_cache[image_key] = content # Update counters total_images_processed += 1 word_count = len(content.split()) total_words += word_count if self.valves.status: await __event_emitter__( { "type": "status", "data": { "description": f"Processed image {processed_image_count}/{images_to_route}: {word_count} description words", "done": False, }, } ) else: if self.valves.status: await __event_emitter__( { "type": "status", "data": { "description": f"No description generated for image {processed_image_count}", "done": False, }, } ) image_descriptions.append( { "description": "No description generated.", "message_index": image_info["message_index"], "image_index_in_message": image_info[ "image_index_in_message" ], } ) except Exception as e: if self.valves.status: await __event_emitter__( { "type": "status", "data": { "description": f"Error processing image {processed_image_count}: {str(e)}", "done": True, }, } ) return body # Do not change anything on error # Modify the original messages for idx_message, message in enumerate(messages): if message.get("role") == "user": content = message.get("content", "") new_content_list = [] # Get descriptions for this message descriptions_in_message = [ desc for desc in image_descriptions if desc["message_index"] == idx_message ] descriptions_in_message.sort(key=lambda x: x["image_index_in_message"]) # Add image descriptions to new content for idx_img, desc in enumerate(descriptions_in_message): image_text = ( f"Image {idx_img+1}: {self.valves.image_context_text} " f"Image description: {desc['description']}" ) new_content_list.append({"type": "text", "text": image_text}) # Extract and add original text content if isinstance(content, list): for part in content: if part.get("type") == "text": 
new_content_list.append(part) elif isinstance(content, str): new_content_list.append({"type": "text", "text": content}) # Update message content and remove images message["content"] = new_content_list message.pop("images", None) else: continue # Do not modify non-user messages # Send final status message if self.valves.status and images_found: total_replaced = len(images_found) status_message = ( f"Replaced {total_replaced} images ({cached_images} cached)" ) await __event_emitter__( { "type": "status", "data": { "description": status_message, "done": True, }, } ) return body def get_image_key(self, image_info): try: if image_info["type"] == "image_url": url = image_info.get("image_url") return str(url) if url else None elif image_info["type"] == "image": image_data = image_info.get("image") if image_data is None: return None # Handle different image data types if isinstance(image_data, bytes): data = image_data elif hasattr(image_data, "read"): data = image_data.read() else: data = str(image_data).encode("utf-8") return hashlib.sha256(data).hexdigest() return None except Exception as e: print(f"Error generating image key: {e}") return None