Chat2HTML • Open WebUI Community

"""
title: Chat2HTML
author: BobbyLLM (orginal idea: based on https://openwebui.com/f/joselico/doc_builder_md_pdf)
version: 1.0.0
"""

import base64
import datetime
import html
import json
import os
import re
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, Field


class Action:
    """Export chat content to a Markdown, plain-text or HTML file on disk.

    The user is prompted (through ``__event_call__``) for what to export, a
    file name and an output format.  Inline base64 JPEG images are written to
    an ``images`` sub-folder of the output directory and replaced by relative
    links.
    """

    class Valves(BaseModel):
        # Optional override for the output folder; empty means "use default".
        save_dir: str = Field(
            default="",
            description="Override folder where Markdown files are stored.",
        )

    # Content-part ``type`` values that are emitted as fenced code blocks.
    _CODE_TYPES = frozenset(
        {
            "code",
            "codeblock",
            "code_block",
            "pre",
            "markdown_code",
            "code-cell",
        }
    )

    # ![alt](data:image/jpeg;base64,XXXX)
    # The alt group excludes "]" so a match cannot start at an earlier,
    # unrelated image/link and swallow the text in between.
    _INLINE_JPEG_RE = re.compile(
        r"!\[([^\]]*)\]\(data:image/(?:jpeg|jpg);base64,([A-Za-z0-9+/=]+)\)",
        re.IGNORECASE,
    )

    # ![alt](images/image_001.jpg) [C:\...\images\image_001.jpg]
    _IMAGE_WITH_PATH_RE = re.compile(r"!\[(.*?)\]\(([^)]+)\)\s+\[(.+)\]")

    _MD_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s+\S")
    _INDENTED_RE = re.compile(r"^\s{4,}|\t")
    _CODE_KEYWORD_RE = re.compile(
        r"^\s*(import|from|def|class|try|except|with|for|while|if|elif|else|return|print\(|@)"
    )

    def __init__(self):
        self.valves = self.Valves()
        # Default exports directory: <cwd>/exports/md
        self.default_md_dir = Path(os.getcwd()) / "exports" / "md"
        self.default_md_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def _nowstamp(self, date_only: bool = False) -> str:
        """Return the current local time as ``YYYY-MM-DD`` or ``YYYY-MM-DD HH:MM``."""
        fmt = "%Y-%m-%d" if date_only else "%Y-%m-%d %H:%M"
        return datetime.datetime.now().strftime(fmt)

    def _default_basename(self) -> str:
        """Return a timestamped fallback base name such as ``document-20250101-1201``."""
        return f"document-{datetime.datetime.now().strftime('%Y%m%d-%H%M')}"

    def _sanitize_name(self, name: str, ext: str) -> str:
        """Turn a user-supplied name into a safe ``<base>.<ext>`` file name.

        Control characters and filesystem-unsafe characters are replaced, one
        trailing known extension (or ``ext`` itself) is stripped, the base is
        limited to 120 characters, and ``ext`` is appended.  An empty name
        falls back to :meth:`_default_basename`.
        """
        name = (name or "").strip() or self._default_basename()
        # Remove control chars
        name = re.sub(r"[\x00-\x1F\x7F]+", " ", name)
        # Replace filesystem-unsafe chars
        name = re.sub(r"[\\/:*?\"<>|]+", "-", name)
        # Collapse spaces
        name = re.sub(r"\s{2,}", " ", name).strip()
        # Normalize dots and spaces
        name = re.sub(r"\s*\.\s*", ".", name)
        name = name.lstrip(".-_ ").strip()

        # Strip one known extension from the end (the requested one first)
        known_exts = ("md", "markdown", "txt", "html", "htm", "doc", "docx", "rtf")
        base = name
        for candidate in (ext,) + known_exts:
            stripped = re.sub(
                rf"\.\s*{re.escape(candidate)}\s*$", "", base, flags=re.IGNORECASE
            )
            if stripped != base:
                base = stripped
                break

        base = base.rstrip(" .")
        # Truncation can expose a trailing space/dot again, so strip once more.
        base = (base or self._default_basename())[:120].rstrip(" .")
        return f"{base or self._default_basename()}.{ext}"

    def _emit_code_fence(self, txt: str, lang: str = "") -> str:
        """Wrap ``txt`` in a Markdown code fence, dropping an unsafe ``lang`` tag."""
        lang = (lang or "").strip()
        if not re.fullmatch(r"[A-Za-z0-9_+.\-#]*", lang):
            lang = ""
        return f"```{lang}\n{txt or ''}\n```"

    def _flat_content(self, content) -> str:
        """Flatten a message ``content`` value into a single Markdown string.

        Accepts ``None``, a string, a list of strings/dict parts, or a single
        dict part.  Dict parts whose ``type`` is one of ``_CODE_TYPES`` become
        fenced code blocks; other dict parts contribute their text.  Anything
        else is converted with ``str()``.
        """
        if content is None:
            return ""
        if isinstance(content, str):
            return content

        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, str):
                    parts.append(item)
                    continue
                if not isinstance(item, dict):
                    continue
                kind = (item.get("type") or "").lower()
                if kind in self._CODE_TYPES:
                    lang = item.get("language") or item.get("lang") or ""
                    data = item.get("data")
                    code_txt = (
                        item.get("text")
                        or item.get("content")
                        or item.get("code")
                        or (isinstance(data, dict) and data.get("code"))
                        or ""
                    )
                    parts.append(self._emit_code_fence(code_txt, lang))
                else:
                    plain = item.get("text") or item.get("content") or ""
                    # str.join below requires strings; coerce anything else.
                    parts.append(plain if isinstance(plain, str) else str(plain))
            return "\n".join(parts)

        if isinstance(content, dict):
            kind = (content.get("type") or "").lower()
            lang = content.get("language") or content.get("lang") or ""
            text = (
                content.get("text")
                or content.get("content")
                or content.get("code")
                or ""
            )
            if kind in self._CODE_TYPES:
                return self._emit_code_fence(text, lang)
            return text if isinstance(text, str) else str(text)

        return str(content)

    @classmethod
    def _looks_like_code(cls, line: str) -> bool:
        """Heuristically decide whether a single line looks like source code."""
        if cls._INDENTED_RE.match(line):
            return True
        if cls._MD_HEADING_RE.match(line):
            return False
        if line.lstrip().startswith("#"):
            return True
        if cls._CODE_KEYWORD_RE.match(line):
            return True
        return line.rstrip().endswith(":")

    def _auto_fence_code_in_text(self, text: str) -> str:
        """Fence runs of three or more code-looking lines.

        Text that already contains a ``` fence is returned unchanged.  Shorter
        runs of code-looking lines are kept as plain text.
        """
        text = text or ""
        if "```" in text:
            return text

        out = []
        run = []

        def flush_run() -> None:
            # Only runs of at least three lines are treated as a code block.
            if len(run) >= 3:
                out.append("```")
                out.extend(run)
                out.append("```")
            else:
                out.extend(run)
            run.clear()

        for line in text.splitlines():
            if self._looks_like_code(line):
                run.append(line)
                continue
            flush_run()
            out.append(line)
        flush_run()

        return "\n".join(out)

    # ------------------------------------------------------------
    # INLINE JPEG IMAGE SUPPORT
    # ------------------------------------------------------------
    def _extract_inline_images(
        self,
        text: str,
        out_dir: Path,
        images_subdir: str = "images",
    ) -> str:
        r"""Write inline base64 JPEGs to disk and replace them with file links.

        Finds patterns like::

            ![alt](data:image/jpeg;base64,XXXX)

        saves each one as ``<out_dir>/<images_subdir>/image_NNN.jpg`` and
        replaces it in the text with::

            ![alt](images/image_001.jpg) [C:\...\images\image_001.jpg]

        i.e. a relative link followed by the resolved absolute path in square
        brackets.  File numbers skip names that already exist, so images from
        an earlier export into the same folder are not overwritten.  An image
        that cannot be decoded or written is left inline as-is.  The images
        folder is only created when the text contains at least one match.
        """
        if not text or not self._INLINE_JPEG_RE.search(text):
            return text

        img_dir = out_dir / images_subdir
        img_dir.mkdir(parents=True, exist_ok=True)
        index = 0

        def replace(match) -> str:
            nonlocal index
            try:
                # binascii.Error (bad padding etc.) is a ValueError subclass.
                raw = base64.b64decode(match.group(2))
            except ValueError:
                return match.group(0)

            # Advance to the next file name that is not taken yet.
            while True:
                index += 1
                fname = f"image_{index:03d}.jpg"
                fpath = img_dir / fname
                if not fpath.exists():
                    break

            try:
                fpath.write_bytes(raw)
            except OSError:
                return match.group(0)

            alt = match.group(1) or "image"
            # Markdown-relative path for renderers
            rel_path = f"{images_subdir}/{fname}"
            # Absolute path shown next to the image
            abs_path = str(fpath.resolve())
            return f"![{alt}]({rel_path}) [{abs_path}]"

        return self._INLINE_JPEG_RE.sub(replace, text)

    # ------------------------------------------------------------
    # FILE → DATA URL HELPER (optional, unused here)
    # ------------------------------------------------------------
    def _file_to_data_url(self, path: Path) -> Optional[str]:
        """Return ``path`` encoded as a ``data:`` URL, or ``None`` if unreadable."""
        if not path or not path.is_file():
            return None

        mime_map = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".svg": "image/svg+xml",
        }
        mime = mime_map.get(path.suffix.lower(), "application/octet-stream")

        try:
            data = path.read_bytes()
        except OSError:
            return None

        b64 = base64.b64encode(data).decode("ascii")
        return f"data:{mime};base64,{b64}"

    # ------------------------------------------------------------
    # HTML conversion (rough manual)
    # ------------------------------------------------------------
    def _escape_html(self, s: str) -> str:
        """Escape ``&``, ``<``, ``>`` and quotes for use in HTML text or attributes."""
        return html.escape(s, quote=True)

    def _md_to_html(self, md: str, title: str) -> str:
        """Convert Markdown to a standalone HTML page (very rough).

        Supported constructs:
        - ``#`` / ``##`` / ... headings
        - ``---`` as ``<hr>``
        - the image+path pattern produced by the inline-image extraction,
          rendered as a clickable ``<a><img></a>`` followed by the path
        - fenced code blocks (an unterminated fence runs to the end of input)
        - every other non-blank line as its own ``<p>``
        """
        html_lines = []
        in_code = False
        code_buf = []

        def flush_code() -> None:
            code_html = self._escape_html("\n".join(code_buf))
            html_lines.append(f"<pre><code>{code_html}</code></pre>")
            code_buf.clear()

        for raw_line in md.splitlines():
            line = raw_line.rstrip("\n")

            if line.startswith("```"):
                if in_code:
                    flush_code()
                in_code = not in_code
                continue

            if in_code:
                code_buf.append(line)
                continue

            stripped = line.strip()
            if not stripped:
                continue

            # Headings
            heading = re.match(r"^(#{1,6})\s+(.*)", stripped)
            if heading:
                level = len(heading.group(1))
                content = self._escape_html(heading.group(2))
                html_lines.append(f"<h{level}>{content}</h{level}>")
                continue

            # Horizontal rule
            if re.fullmatch(r"-{3,}", stripped):
                html_lines.append("<hr />")
                continue

            # Our image + absolute path pattern
            image = self._IMAGE_WITH_PATH_RE.match(stripped)
            if image:
                alt = self._escape_html(image.group(1))
                src = self._escape_html(image.group(2))
                win_path = self._escape_html(image.group(3))
                html_lines.append(
                    f'<p><a href="{src}"><img src="{src}" alt="{alt}" /></a> '
                    f'<span class="img-path">{win_path}</span></p>'
                )
                continue

            # Default paragraph
            html_lines.append(f"<p>{self._escape_html(stripped)}</p>")

        # A fence that was never closed must not silently lose its content.
        if in_code:
            flush_code()

        body = "\n".join(html_lines)
        title_html = self._escape_html(title or "Export")

        return (
            "<!DOCTYPE html>\n<html>\n<head>\n"
            '<meta charset="utf-8">\n'
            f"<title>{title_html}</title>\n"
            "<style>"
            "body{font-family:system-ui,Arial,sans-serif;max-width:800px;margin:2em auto;line-height:1.5;}"
            "pre{background:#f4f4f4;padding:1em;overflow:auto;}"
            "code{font-family:Consolas,monospace;}"
            "h1,h2,h3{margin-top:1.4em;}"
            ".img-path{font-size:0.9em;color:#555;margin-left:0.5em;}"
            "</style>\n"
            "</head>\n<body>\n"
            f"{body}\n"
            "</body>\n</html>"
        )

    # ------------------------------------------------------------
    # Browser-side forced download (kept for reference, NOT USED)
    # ------------------------------------------------------------
    async def _force_download_md(
        self, __event_call__, filename: str, md_text: str
    ) -> Optional[str]:
        """Send an ``execute`` event whose JS downloads ``md_text`` as ``filename``.

        Returns whatever the event call returns, or ``"err:execute"`` when no
        callable is available or the call raises.
        """
        if not __event_call__ or not callable(__event_call__):
            return "err:execute"

        b64 = base64.b64encode(md_text.encode("utf-8")).decode("ascii")
        # json.dumps yields valid JS string literals for both values.
        js = f"""
(function(){{
  try {{
    const a = document.createElement('a');
    const b64 = {json.dumps(b64)};
    const fname = {json.dumps(filename)};
    a.href = "data:text/markdown;charset=utf-8;base64," + b64;
    a.download = fname;
    document.body.appendChild(a);
    a.click();
    setTimeout(() => a.remove(), 0);
    return "ok";
  }} catch(e) {{
    return "err:" + (e && e.message ? e.message : 'unknown');
  }}
}})();
"""
        try:
            return await __event_call__({"type": "execute", "data": {"code": js}})
        except Exception:
            return "err:execute"

    async def _prompt(
        self,
        __event_call__,
        title: str,
        placeholder: str = "",
    ) -> str:
        """Send an ``input`` event and return the entered string.

        Returns ``""`` when no callable is available, the call raises, or the
        reply is not a string (directly or under its ``"data"`` key).
        """
        if not __event_call__ or not callable(__event_call__):
            return ""

        safe_id = re.sub(r"[^A-Za-z0-9_]+", "_", title).strip("_") or "prompt"
        try:
            reply = await __event_call__(
                {
                    "type": "input",
                    "data": {
                        "id": safe_id,
                        "title": title,
                        "message": "",
                        "placeholder": placeholder,
                    },
                }
            )
        except Exception:
            return ""

        value = reply.get("data") if isinstance(reply, dict) else reply
        return value if isinstance(value, str) else ""

    async def _notify(self, emitter, payload):
        """Send ``payload`` through ``emitter`` on a best-effort basis."""
        if not emitter or not callable(emitter):
            return
        try:
            await emitter(payload)
        except Exception:
            # Deliberately ignored: a failed notification must not abort
            # (or mask the result of) the export itself.
            pass

    # ------------------------------------------------------------
    # Directory handling
    # ------------------------------------------------------------
    def _output_md_dir(self, save_dir: str) -> Path:
        """Return (and create) the directory for exported files and images.

        - If ``save_dir`` is non-empty after stripping whitespace, use it
          (with ``~`` expanded).
        - Else, use the default ``exports/md`` directory.
        """
        save_dir = (save_dir or "").strip()  # strip \n, spaces, etc.
        target = Path(save_dir).expanduser() if save_dir else self.default_md_dir
        target.mkdir(parents=True, exist_ok=True)
        return target

    def _unique_path(self, base_path: Path) -> Path:
        """Return a path that does not exist yet, derived from ``base_path``.

        If ``base_path`` exists, a timestamp suffix is appended so nothing is
        overwritten, e.g. ``test.md`` -> ``test-20250101-120102.md``.  If that
        name is taken as well (two exports within the same second), a counter
        is appended: ``test-20250101-120102-2.md``.
        """
        if not base_path.exists():
            return base_path

        ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        candidate = base_path.with_name(f"{base_path.stem}-{ts}{base_path.suffix}")
        counter = 1
        while candidate.exists():
            counter += 1
            candidate = base_path.with_name(
                f"{base_path.stem}-{ts}-{counter}{base_path.suffix}"
            )
        return candidate

    # ------------------------------------------------------------
    # Main action
    # ------------------------------------------------------------
    def _last_message_text(self, messages: list, role: str) -> str:
        """Return the flattened content of the most recent message with ``role``."""
        last = next((m for m in reversed(messages) if m.get("role") == role), None)
        return self._flat_content((last or {}).get("content"))

    def _select_source(self, messages: list, selection: str) -> Optional[str]:
        """Build the Markdown source for a menu ``selection``; ``None`` if invalid."""
        if selection == "1":
            return (
                self._last_message_text(messages, "assistant")
                or "(No recent assistant message.)"
            )
        if selection == "2":
            return (
                self._last_message_text(messages, "user")
                or "(No recent user message.)"
            )
        if selection != "3":
            return None

        # Whole conversation, one headed section per message
        sections = []
        for message in messages:
            role = message.get("role")
            if role not in ("user", "assistant", "system"):
                continue
            text = self._flat_content(message.get("content")).strip()
            if not text:
                continue
            if role == "user":
                sections.append(f"## User\n\n{text}\n")
            elif role == "assistant":
                sections.append(f"## Assistant\n\n{text}\n")
            else:
                # keep system notes as HTML-style comments in MD; "-->" inside
                # the note would end the comment early, so neutralise it.
                safe = text.replace("-->", "--&gt;")
                sections.append(f"<!-- system: {safe} -->\n")
        return "\n---\n".join(sections).strip() or "(Empty conversation)"

    def _build_content(self, fmt: str, title: str, now: str, src: str):
        """Return ``(extension, file content)`` for the chosen export format."""
        if fmt == "2":
            # Plain text
            return "txt", f"{title}\n{now}\n\n{'-' * 40}\n\n{src}\n"

        md_full = f"# {title}\n\n*{now}*\n\n---\n\n{src}\n"
        if fmt == "3":
            # HTML (rough Markdown → HTML)
            return "html", self._md_to_html(md_full, title)
        # Default: Markdown
        return "md", md_full

    async def action(
        self,
        body: dict,
        user: Optional[dict] = None,
        __event_emitter__=None,
        __event_call__=None,
    ):
        """Prompt for source, file name and format, then write the export file.

        Reads ``body["messages"]``.  Progress and failures are reported through
        ``__event_emitter__``; any exception is caught and reported as a
        message rather than raised.  Returns ``None``.
        """
        try:
            messages = body.get("messages", []) or []

            # Ask what to export
            selection = (
                await self._prompt(
                    __event_call__,
                    "What would you like exported",
                    "1 = LLM's last output • 2 = User's last input • 3 = Entire conversation to date",
                )
            ).strip()[:1]

            if not selection:
                await self._notify(
                    __event_emitter__,
                    {
                        "type": "notification",
                        "data": {
                            "type": "warning",
                            "content": "Cancelled: source selection is required.",
                        },
                    },
                )
                return

            # Choose source text
            src = self._select_source(messages, selection)
            if src is None:
                await self._notify(
                    __event_emitter__,
                    {
                        "type": "notification",
                        "data": {"type": "warning", "content": "Invalid option."},
                    },
                )
                return

            # Auto-fence obvious code blocks if no fences exist yet
            src = self._auto_fence_code_in_text(src)

            # File name prompt
            suggested = self._default_basename()
            base_name = (
                await self._prompt(
                    __event_call__, "What do you want to call this file?", suggested
                )
            ).strip() or suggested

            # Use the base name without extension and add ext per format
            base_stem = Path(self._sanitize_name(base_name, "md")).stem
            title = base_stem
            now = self._nowstamp(date_only=True)

            # Resolve output directory (user-set save_dir or default)
            out_dir = self._output_md_dir(self.valves.save_dir)

            # Convert inline base64 JPEGs to files in out_dir/images
            src = self._extract_inline_images(src, out_dir, images_subdir="images")

            # Ask for export format
            fmt = (
                await self._prompt(
                    __event_call__,
                    "Export format",
                    "1 = Markdown (.md) • 2 = Plain text (.txt) • 3 = HTML (.html)",
                )
            ).strip()[:1] or "1"

            # Build content per format
            ext, content = self._build_content(fmt, title, now, src)

            # Final filename with chosen extension, never overwriting
            file_path = self._unique_path(out_dir / self._sanitize_name(base_stem, ext))

            # Write file
            file_path.write_text(content, encoding="utf-8")

            await self._notify(
                __event_emitter__,
                {
                    "type": "notification",
                    "data": {
                        "type": "success",
                        "content": f"Export saved: {file_path}",
                    },
                },
            )

        except Exception as exc:
            # Top-level boundary: report the failure to the chat instead of raising.
            await self._notify(
                __event_emitter__,
                {
                    "type": "message",
                    "data": {"content": f"Doc Builder error (export): {exc}"},
                },
            )