"""
title: Gemini Vision
author: MMie
author_url: https://github.com/open-webui
version: 0.1.0
license: MIT
required_open_webui_version: 0.3.8
requirements: google-generativeai
environment_variables:
- GOOGLE_API_KEY (required for image processing)
- GOOGLE_VISION_MODEL (optional, default: gemini-2.0-flash)
"""
import base64
import hashlib
import logging
import mimetypes
import os
import re
import time
import urllib.request
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

import google.generativeai as genai
from pydantic import BaseModel, Field
class CacheEntry:
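    """One cached image description plus the time it was stored."""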
def __init__(self, description: str, ts: Optional[float] = None):
self.description = description
self.timestamp = ts if ts is not None else time.time()
class Filter:
class Valves(BaseModel):
priority: int = Field(
default=0,
description="Execution priority of this filter",
)
GOOGLE_API_KEY: str = Field(
default=os.getenv("GOOGLE_API_KEY", ""),
description="Google API Key for Gemini Vision",
)
GOOGLE_VISION_MODEL: str = Field(
default=os.getenv("GOOGLE_VISION_MODEL", "gemini-2.0-flash"),
description="Gemini model ID (e.g., gemini-2.0-flash or gemini-1.5-pro)",
)
status: bool = Field(
default=True,
description="Send status messages to the UI",
)
        max_images: int = Field(
            default=6,
            description="Maximum number of images per user message (0 = no limit)",
        )
cache_ttl_sec: int = Field(
default=1800,
description="Cache lifetime for image descriptions (seconds)",
)
VISION_PROMPT: str = """You are an expert in precise, lossless image analysis.
Perform the following tasks accurately and without omissions:
A) 1:1-TRANSCRIPTION OF ALL TEXT ELEMENTS:
- Transcribe ALL character strings exactly: titles, headings, labels, legends, axis labels, units, diagram texts, arrow texts, notes, formulas, numbering.
- Preserve capitalization, special characters, spaces, line breaks.
- Do not omit anything, do not summarize, do not rephrase.
- Mark unreadable characters as [?].
B) VISUAL ANALYSIS:
- Describe completely and in detail all relevant objects, symbols, diagrams, curves, axes (with scales, units, marked values), geometric shapes, colors, relative positions, and relationships.
C) TECHNICAL INTERPRETATION:
- Explain fully what is depicted (e.g., the physical system and its connections); do not summarize.
- State the key findings clearly.
- Render formulas exactly, as LaTeX, where recognizable.
OUTPUT THE RESULT EXACTLY IN THIS FORMAT:
TEXT-IN-IMAGE:
<verbatim transcription, block by block>
VISUAL-ELEMENTS:
<detailed description>
INTERPRETATION:
<brief technical classification>"""
    def __init__(self):
        logging.basicConfig(level=logging.INFO)
        self.valves = self.Valves()
        # Maps a cache key (URL, or a hash of a data URL) to its description.
        self.image_cache: Dict[str, CacheEntry] = {}
        # Collapses runs of blank lines when stitching text and image descriptions.
        self._re_two_newlines = re.compile(r"\n\n+")
    def _clean_expired_cache(self):
        """Drop cached descriptions older than the configured TTL."""
        now = time.time()
        ttl = float(self.valves.cache_ttl_sec)
        expired = [k for k, e in self.image_cache.items() if now - e.timestamp > ttl]
        for k in expired:
            del self.image_cache[k]
    def _extract_images_and_text(self, message: Dict) -> Tuple[List[Dict], str]:
        """Split a chat message into its image items and its plain text."""
        images: List[Dict] = []
        text_parts: List[str] = []
        content = message.get("content", "")
        if isinstance(content, list):
            for item in content:
                if not isinstance(item, dict):
                    continue
                if item.get("type") == "text":
                    t = item.get("text", "")
                    if isinstance(t, str):
                        text_parts.append(t)
                elif item.get("type") == "image_url":
                    images.append(item)
        elif isinstance(content, str):
            text_parts.append(content)
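        # Fallback: some clients attach images in a separate "images" field
        # rather than in the multimodal "content" list.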
if not images:
raw_images = message.get("images")
if isinstance(raw_images, list):
for im in raw_images:
if isinstance(im, str):
images.append({"type": "image_url", "image_url": {"url": im}})
elif isinstance(im, dict):
if "url" in im and isinstance(im["url"], str):
images.append(
{"type": "image_url", "image_url": {"url": im["url"]}}
)
elif "b64" in im and isinstance(im["b64"], str):
images.append(
{
"type": "image_url",
"image_url": {
"url": "data:image/jpeg;base64," + im["b64"]
},
}
)
text = " ".join(tp for tp in text_parts if isinstance(tp, str)).strip()
return images, text
    def _build_image_part(self, image_url: str) -> Dict:
        """Convert an image URL into an inline-data part for the Gemini SDK."""
        if image_url.startswith("data:image"):
            header, b64 = image_url.split(",", 1)
            mime = "image/jpeg"
            if ":" in header and ";" in header:
                mime = header.split(":", 1)[1].split(";", 1)[0]
            return {"inline_data": {"mime_type": mime, "data": b64}}
        # The SDK does not accept remote URLs directly, so fetch and inline the
        # bytes. Errors propagate to the caller, which reports an image error.
        with urllib.request.urlopen(image_url, timeout=20) as resp:
            data = resp.read()
            mime = resp.headers.get_content_type()
        if not mime.startswith("image/"):
            mime = mimetypes.guess_type(image_url)[0] or "image/jpeg"
        return {
            "inline_data": {
                "mime_type": mime,
                "data": base64.b64encode(data).decode("ascii"),
            }
        }
async def _emit(
self, emitter: Optional[Callable[[Any], Awaitable[None]]], text: str, done: bool
):
if self.valves.status and emitter:
await emitter(
{"type": "status", "data": {"description": text, "done": done}}
)
    async def _describe_image_with_gemini(self, image_item: Dict, emitter=None) -> str:
        """Return a cached or freshly generated Gemini description of one image."""
        try:
            if not self.valves.GOOGLE_API_KEY:
                return "[Note: Image analysis disabled (GOOGLE_API_KEY missing).]"
            image_url = image_item.get("image_url", {}).get("url", "")
            if not isinstance(image_url, str) or not image_url:
                return "[Image error: invalid image URL]"
            self._clean_expired_cache()
            # Hash data URLs so the cache key stays short but unique; a fixed
            # prefix of the base64 payload could collide between images.
            cache_key = (
                hashlib.sha256(image_url.encode()).hexdigest()
                if image_url.startswith("data:image")
                else image_url
            )
            if cache_key in self.image_cache:
                return self.image_cache[cache_key].description
            await self._emit(emitter, "Analyzing image with Gemini...", False)
            genai.configure(api_key=self.valves.GOOGLE_API_KEY)
            model = genai.GenerativeModel(self.valves.GOOGLE_VISION_MODEL)
            image_part = self._build_image_part(image_url)
            # Use the async variant so the event loop is not blocked on the call.
            resp = await model.generate_content_async([self.VISION_PROMPT, image_part])
            description = (resp.text or "").strip() if hasattr(resp, "text") else ""
            if not description:
                description = "[Note: No description received from Gemini.]"
            self.image_cache[cache_key] = CacheEntry(description)
            await self._emit(emitter, "Image analysis completed", True)
            return description
        except Exception as e:
            await self._emit(emitter, f"Error: {e}", True)
            return f"[Image error: {e}]"
async def inlet(
self,
body: Dict,
__event_emitter__: Optional[Callable[[Any], Awaitable[None]]] = None,
user: Optional[Dict] = None,
model: Optional[Dict] = None,
) -> Dict:
"""
Inlet function corrected: No double **kwargs anymore.
user and model are explicitly queried as Optional[Dict].
"""
messages = body.get("messages")
if not isinstance(messages, list) or not messages:
return body
if not self.valves.GOOGLE_API_KEY:
# Only log, but continue
logging.warning("Gemini Vision Filter: No API Key set.")
new_messages: List[Dict] = []
for msg in messages:
role = msg.get("role", "")
if role != "user":
new_messages.append(msg)
continue
images, text = self._extract_images_and_text(msg)
if not images:
new_messages.append(msg)
continue
            # Enforce the per-message image limit (0 = no limit).
            limit = max(0, int(self.valves.max_images))
            if limit and len(images) > limit:
                images = images[:limit]
            # Without an API key, replace images with a notice instead of failing.
            if not self.valves.GOOGLE_API_KEY:
combined = (
(text + "\n\n" if text else "")
+ "[Images removed: Please set GOOGLE_API_KEY in the filter valves]"
)
new_messages.append({"role": role, "content": combined.strip()})
continue
descriptions: List[str] = []
for idx, image in enumerate(images, 1):
desc = await self._describe_image_with_gemini(image, __event_emitter__)
descriptions.append(
f"--- IMAGE {idx} ANALYSIS ---\n{desc}\n-----------------------"
)
combined_content = (text + "\n\n" if text else "") + "\n\n".join(
descriptions
)
combined_content = self._re_two_newlines.sub(
"\n\n", combined_content
).strip()
new_messages.append({"role": role, "content": combined_content})
body["messages"] = new_messages
return body
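

# ---------------------------------------------------------------------------
# Minimal local smoke test (a sketch; not used by the Open WebUI runtime).
# It feeds a fake multimodal user message through inlet(). The tiny base64
# payload below is illustrative only, not a real image; if GOOGLE_API_KEY is
# unset, the filter takes the "images removed" path and makes no network call.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo():
        flt = Filter()
        body = {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What does this diagram show?"},
                        {
                            "type": "image_url",
                            "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
                        },
                    ],
                }
            ]
        }
        result = await flt.inlet(body)
        print(result["messages"][0]["content"])

    asyncio.run(_demo())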