Tool
v0.0.2
Web Search and Scraping with SearXNG and Docling
Last Updated: 7 months ago
Created: 7 months ago
Tool ID: web_search_searxng_docling
Creator: @kemon
Downloads: 336+
Description
This tool uses SearXNG to search the web and Docling to scrape the resulting web pages.
README
Tool Code
""" title: Searxng Search and Docling Scraper author: Kemon description: This tool uses searxng to search the web and docling to scrape the webpages required_open_webui_version: 0.6.0 requirements: version: 0.0.2 licence: MIT """ import requests from datetime import date, datetime from pydantic import BaseModel, Field from typing import List, Optional, Callable, Awaitable class EventEmitter: def __init__(self, event_emitter: Callable[[dict], Awaitable[None]] = None): self.event_emitter = event_emitter async def progress_update(self, description): await self.emit(description) async def error_update(self, description): await self.emit(description, "error", True) async def success_update(self, description): await self.emit(description, "success", True) async def emit(self, description="Unknown State", status="in_progress", done=False): if self.event_emitter: await self.event_emitter( { "type": "status", "data": { "status": status, "description": description, "done": done, }, } ) async def citation(self, document, url, name=None): if self.event_emitter: if name == None: name = url await self.event_emitter( { "type": "citation", "data": { "document": [document], "metadata": [ { "date_accessed": datetime.now().isoformat(), "source": url, } ], "source": {"name": name, "url": url}, }, } ) class Tools: class Valves(BaseModel): DOCLING_URL: str = Field( default="http://host.docker.internal:5001", description="Docling URL", ) DOCLING_TIMEOUT: float = Field( default=5, description="Docling URL Timeout", ) SEARXNG_URL: str = Field( default="http://host.docker.internal:8080", description="SearXNG URL", ) SEARXNG_SCORE_FILTER: float = Field( default=1, description="SearXNG Score Filter", ) SEARXNG_SCORE_RESULT: int = Field( default=5, description="SearXNG Score Result", ) def __init__(self): self.valves = self.Valves() self.citation = False pass async def docling( self, urls: list, __event_emitter__: Callable[[dict], Awaitable[None]], ) -> str: """ Web scrape the website provided and get the content of it. :param urls: List of urls. :return: Return relevant context. 
""" docling_url = self.valves.DOCLING_URL docling_timeout = self.valves.DOCLING_TIMEOUT emitter = EventEmitter(__event_emitter__) await emitter.progress_update("Scraping") api_url = f"{docling_url}/v1alpha/convert/source" headers = {"accept": "application/json", "Content-Type": "application/json"} all_responses = [] for url in urls: # add https to url if it does not start with http or https if not url.startswith("http") and not url.startswith("https"): url = f"https://{url}" data = { "options": {"to_formats": ["md", "doctags"]}, "http_sources": [{"url": url}], } if url.endswith(".pdf") or url.endswith(".docx"): docling_timeout = docling_timeout * 2 try: await emitter.progress_update(f"Scraping {url}") response = requests.post( api_url, headers=headers, json=data, timeout=docling_timeout ) await emitter.success_update("Scraping complete") except requests.exceptions.Timeout: await emitter.error_update(f"Timeout occurred for {url}") continue except requests.exceptions.RequestException as e: await emitter.error_update(f"{e}") return f"{e}" if response.status_code == 200: data = response.json() # Extract md_content and md_doctags from the JSON object md_content = data["document"]["md_content"] md_doctags = data["document"]["doctags_content"] # Use a ternary operator to set the 'content' variable content = md_content if md_content != "" else md_doctags all_responses.append(f"<url:{url}>\n{content}\n</url:{url}>\n") await emitter.citation(content, url) else: await emitter.error_update(f"Error fetching {url}") return "".join(all_responses) async def searxng( self, search_query: str, __event_emitter__: Callable[[dict], Awaitable[None]] ) -> str: """ Search the web and get the content of the relevant pages. Search for unknown knowledge, news, info, public contact info, weather, etc. :param query: Web Query used in search engine. :return: Provide the answer to the question as promptly as possible. """ searxng_url = self.valves.SEARXNG_URL searxng_score_filter = self.valves.SEARXNG_SCORE_FILTER searxng_score_result = self.valves.SEARXNG_SCORE_RESULT emitter = EventEmitter(__event_emitter__) api_url = f"{searxng_url}/search?q={search_query}&format=json&safesearch=0" headers = {"Content-Type": "application/json", "Accept": "application/json"} try: await emitter.progress_update("Searching") response = requests.get(api_url, headers=headers) except requests.exceptions.Timeout: await emitter.error_update(f"Timeout occurred for {url}") return f"Timeout occurred for {url}" except requests.exceptions.RequestException as e: await emitter.error_update(f"{e}") return f"{e}" await emitter.progress_update("Search Complete") if response.status_code == 200: data = response.json() await emitter.progress_update("Generating Response") data["results"] = [ result["url"] for result in data["results"] if result["score"] > searxng_score_filter ] results = await self.docling( data["results"][:searxng_score_result], __event_emitter__ ) await emitter.success_update(f"Complete! Search Query: {search_query}") return results else: await emitter.error_update( f"Failed to fetch results from SearXNG: {response.status_code}" ) return [f"Failed to fetch results from SearXNG: {response.status_code}"]