"""
title: Gemini Vision
author: MMie
author_url: https://github.com/open-webui
version: 0.1.0
license: MIT
required_open_webui_version: 0.3.8
requirements: google-generativeai
environment_variables:
- GOOGLE_API_KEY (required for image processing)
- GOOGLE_VISION_MODEL (optional, default: gemini-2.0-flash)
"""
import base64
import hashlib
import logging
import mimetypes
import os
import re
import time
import urllib.request
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

import google.generativeai as genai
from pydantic import BaseModel, Field
class CacheEntry:
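    """One cached image description plus the time it was stored."""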
def __init__(self, description: str, ts: Optional[float] = None):
self.description = description
self.timestamp = ts if ts is not None else time.time()
class Filter:
class Valves(BaseModel):
priority: int = Field(
default=0,
description="Execution priority of this filter",
)
GOOGLE_API_KEY: str = Field(
default=os.getenv("GOOGLE_API_KEY", ""),
description="Google API Key for Gemini Vision",
)
GOOGLE_VISION_MODEL: str = Field(
default=os.getenv("GOOGLE_VISION_MODEL", "gemini-2.0-flash"),
description="Gemini model ID (e.g., gemini-2.0-flash or gemini-1.5-pro)",
)
status: bool = Field(
default=True,
description="Send status messages to the UI",
)
        max_images: int = Field(
            default=6,
            description="Maximum number of images per user message (0 = no limit)",
        )
cache_ttl_sec: int = Field(
default=1800,
description="Cache lifetime for image descriptions (seconds)",
)
VISION_PROMPT: str = """You are an expert in precise, lossless image analysis.
Perform the following tasks accurately and without omissions:
A) 1:1-TRANSCRIPTION OF ALL TEXT ELEMENTS:
- Transcribe ALL character strings exactly: titles, headings, labels, legends, axis labels, units, diagram texts, arrow texts, notes, formulas, numbering.
- Preserve capitalization, special characters, spaces, line breaks.
- Do not omit anything, do not summarize, do not rephrase.
- Mark unreadable characters as [?].
B) VISUAL ANALYSIS:
- Describe completely and in detail all relevant objects, symbols, diagrams, curves, axes (with scales, units, marked values), geometric shapes, colors, relative positions, and relationships.
C) TECHNICAL INTERPRETATION:
- Explain fully what is depicted (e.g., the physical system and its connections); do not summarize.
- State the key findings clearly.
- Render formulas exactly, as LaTeX, where recognizable.
OUTPUT THE RESULT EXACTLY IN THIS FORMAT:
TEXT-IN-IMAGE:
<verbatim transcription, block by block>
VISUAL-ELEMENTS:
<detailed description>
INTERPRETATION:
<brief technical classification>"""
    def __init__(self):
        logging.basicConfig(level=logging.INFO)
        self.valves = self.Valves()
        # Maps a cache key (URL, or a hash of a data URL) to its description.
        self.image_cache: Dict[str, CacheEntry] = {}
        # Collapses runs of blank lines when stitching text and image descriptions.
        self._re_two_newlines = re.compile(r"\n\n+")
    def _clean_expired_cache(self):
        """Drop cached descriptions older than the configured TTL."""
        now = time.time()
        ttl = float(self.valves.cache_ttl_sec)
        expired = [k for k, e in self.image_cache.items() if now - e.timestamp > ttl]
        for k in expired:
            del self.image_cache[k]
    def _extract_images_and_text(self, message: Dict) -> Tuple[List[Dict], str]:
        """Split a chat message into its image items and its plain text."""
        images: List[Dict] = []
        text_parts: List[str] = []
        content = message.get("content", "")
        if isinstance(content, list):
            for item in content:
                if not isinstance(item, dict):
                    continue
                if item.get("type") == "text":
                    t = item.get("text", "")
                    if isinstance(t, str):
                        text_parts.append(t)
                elif item.get("type") == "image_url":
                    images.append(item)
        elif isinstance(content, str):
            text_parts.append(content)
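        # Fallback: some clients attach images in a separate "images" field
        # rather than in the multimodal "content" list.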
if not images:
raw_images = message.get("images")
if isinstance(raw_images, list):
for im in raw_images:
if isinstance(im, str):
images.append({"type": "image_url", "image_url": {"url": im}})
elif isinstance(im, dict):
if "url" in im and isinstance(im["url"], str):
images.append(
{"type": "image_url", "image_url": {"url": im["url"]}}
)
elif "b64" in im and isinstance(im["b64"], str):
images.append(
{
"type": "image_url",
"image_url": {
"url": "data:image/jpeg;base64," + im["b64"]
},
}
)
text = " ".join(tp for tp in text_parts if isinstance(tp, str)).strip()
return images, text
    def _build_image_part(self, image_url: str) -> Dict:
        """Convert an image URL into an inline-data part for the Gemini SDK."""
        if image_url.startswith("data:image"):
            header, b64 = image_url.split(",", 1)
            mime = "image/jpeg"
            if ":" in header and ";" in header:
                mime = header.split(":", 1)[1].split(";", 1)[0]
            return {"inline_data": {"mime_type": mime, "data": b64}}
        # The SDK does not accept remote URLs directly, so fetch and inline the
        # bytes. Errors propagate to the caller, which reports an image error.
        with urllib.request.urlopen(image_url, timeout=20) as resp:
            data = resp.read()
            mime = resp.headers.get_content_type()
        if not mime.startswith("image/"):
            mime = mimetypes.guess_type(image_url)[0] or "image/jpeg"
        return {
            "inline_data": {
                "mime_type": mime,
                "data": base64.b64encode(data).decode("ascii"),
            }
        }
async def _emit(
self, emitter: Optional[Callable[[Any], Awaitable[None]]], text: str, done: bool
):
if self.valves.status and emitter:
await emitter(
{"type": "status", "data": {"description": text, "done": done}}
)
    async def _describe_image_with_gemini(self, image_item: Dict, emitter=None) -> str:
        """Return a cached or freshly generated Gemini description of one image."""
        try:
            if not self.valves.GOOGLE_API_KEY:
                return "[Note: Image analysis disabled (GOOGLE_API_KEY missing).]"
            image_url = image_item.get("image_url", {}).get("url", "")
            if not isinstance(image_url, str) or not image_url:
                return "[Image error: invalid image URL]"
            self._clean_expired_cache()
            # Hash data URLs so the cache key stays short but unique; a fixed
            # prefix of the base64 payload could collide between images.
            cache_key = (
                hashlib.sha256(image_url.encode()).hexdigest()
                if image_url.startswith("data:image")
                else image_url
            )
            if cache_key in self.image_cache:
                return self.image_cache[cache_key].description
            await self._emit(emitter, "Analyzing image with Gemini...", False)
            genai.configure(api_key=self.valves.GOOGLE_API_KEY)
            model = genai.GenerativeModel(self.valves.GOOGLE_VISION_MODEL)
            image_part = self._build_image_part(image_url)
            # Use the async variant so the event loop is not blocked on the call.
            resp = await model.generate_content_async([self.VISION_PROMPT, image_part])
            description = (resp.text or "").strip() if hasattr(resp, "text") else ""
            if not description:
                description = "[Note: No description received from Gemini.]"
            self.image_cache[cache_key] = CacheEntry(description)
            await self._emit(emitter, "Image analysis completed", True)
            return description
        except Exception as e:
            await self._emit(emitter, f"Error: {e}", True)
            return f"[Image error: {e}]"
async def inlet(
self,
body: Dict,
__event_emitter__: Optional[Callable[[Any], Awaitable[None]]] = None,
user: Optional[Dict] = None,
model: Optional[Dict] = None,
) -> Dict:
"""
Inlet function corrected: No double **kwargs anymore.
user and model are explicitly queried as Optional[Dict].
"""
messages = body.get("messages")
if not isinstance(messages, list) or not messages:
return body
if not self.valves.GOOGLE_API_KEY:
# Only log, but continue
logging.warning("Gemini Vision Filter: No API Key set.")
new_messages: List[Dict] = []
for msg in messages:
role = msg.get("role", "")
if role != "user":
new_messages.append(msg)
continue
images, text = self._extract_images_and_text(msg)
if not images:
new_messages.append(msg)
continue
            # Enforce the per-message image limit (0 = no limit).
            limit = max(0, int(self.valves.max_images))
            if limit and len(images) > limit:
                images = images[:limit]
            # Without an API key, replace images with a notice instead of failing.
            if not self.valves.GOOGLE_API_KEY:
combined = (
(text + "\n\n" if text else "")
+ "[Images removed: Please set GOOGLE_API_KEY in the filter valves]"
)
new_messages.append({"role": role, "content": combined.strip()})
continue
descriptions: List[str] = []
for idx, image in enumerate(images, 1):
desc = await self._describe_image_with_gemini(image, __event_emitter__)
descriptions.append(
f"--- IMAGE {idx} ANALYSIS ---\n{desc}\n-----------------------"
)
combined_content = (text + "\n\n" if text else "") + "\n\n".join(
descriptions
)
combined_content = self._re_two_newlines.sub(
"\n\n", combined_content
).strip()
new_messages.append({"role": role, "content": combined_content})
body["messages"] = new_messages
return body
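

# ---------------------------------------------------------------------------
# Minimal local smoke test (a sketch; not used by the Open WebUI runtime).
# It feeds a fake multimodal user message through inlet(). The tiny base64
# payload below is illustrative only, not a real image; if GOOGLE_API_KEY is
# unset, the filter takes the "images removed" path and makes no network call.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo():
        flt = Filter()
        body = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What does this diagram show?"},
                        {
                            "type": "image_url",
                            "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
                        },
                    ],
                }
            ]
        }
        result = await flt.inlet(body)
        print(result["messages"][0]["content"])

    asyncio.run(_demo())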