Tool
v0.0.2
Web Search and Scraping with SearXNG and Docling
Tool ID
web_search_searxng_docling
Creator
@kemon
Downloads
72+
This tool uses SearXNG to search the web and Docling to scrape the result pages.
README
Web Search and Scraping with SearXNG and Docling
This tool uses SearXNG to search the web and Docling to scrape the result pages.
Requires a running SearXNG instance and a running Docling server.
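To confirm both services are reachable before enabling the tool, you can hit the same two endpoints the tool itself calls. A minimal sketch, assuming the tool's default valve URLs (adjust them to your deployment) and that your SearXNG instance has the json output format enabled in its settings:

import requests

SEARXNG_URL = "http://host.docker.internal:8080"  # default SEARXNG_URL valve
DOCLING_URL = "http://host.docker.internal:5001"  # default DOCLING_URL valve

# SearXNG's JSON API; typically rejected (403) unless "json" is among
# the enabled output formats in SearXNG's settings.
r = requests.get(
    f"{SEARXNG_URL}/search",
    params={"q": "test", "format": "json"},
    timeout=10,
)
print("SearXNG:", r.status_code)

# The same Docling endpoint and payload the tool uses for scraping.
r = requests.post(
    f"{DOCLING_URL}/v1alpha/convert/source",
    json={
        "options": {"to_formats": ["md", "doctags"]},
        "http_sources": [{"url": "https://example.com"}],
    },
    timeout=30,
)
print("Docling:", r.status_code)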
Tool Code
""" title: Searxng Search and Docling Scraper author: Kemon description: This tool uses searxng to search the web and docling to scrape the webpages required_open_webui_version: 0.6.0 requirements: version: 0.0.2 licence: MIT """ import requests from datetime import date, datetime from pydantic import BaseModel, Field from typing import List, Optional, Callable, Awaitable class EventEmitter: def __init__(self, event_emitter: Callable[[dict], Awaitable[None]] = None): self.event_emitter = event_emitter async def progress_update(self, description): await self.emit(description) async def error_update(self, description): await self.emit(description, "error", True) async def success_update(self, description): await self.emit(description, "success", True) async def emit(self, description="Unknown State", status="in_progress", done=False): if self.event_emitter: await self.event_emitter( { "type": "status", "data": { "status": status, "description": description, "done": done, }, } ) async def citation(self, document, url, name=None): if self.event_emitter: if name == None: name = url await self.event_emitter( { "type": "citation", "data": { "document": [document], "metadata": [ { "date_accessed": datetime.now().isoformat(), "source": url, } ], "source": {"name": name, "url": url}, }, } ) class Tools: class Valves(BaseModel): DOCLING_URL: str = Field( default="http://host.docker.internal:5001", description="Docling URL", ) DOCLING_TIMEOUT: float = Field( default=5, description="Docling URL Timeout", ) SEARXNG_URL: str = Field( default="http://host.docker.internal:8080", description="SearXNG URL", ) SEARXNG_SCORE_FILTER: float = Field( default=1, description="SearXNG Score Filter", ) SEARXNG_SCORE_RESULT: int = Field( default=5, description="SearXNG Score Result", ) def __init__(self): self.valves = self.Valves() self.citation = False pass async def docling( self, urls: list, __event_emitter__: Callable[[dict], Awaitable[None]], ) -> str: """ Web scrape the website provided and get the content of it. :param urls: List of urls. :return: Return relevant context. 
""" docling_url = self.valves.DOCLING_URL docling_timeout = self.valves.DOCLING_TIMEOUT emitter = EventEmitter(__event_emitter__) await emitter.progress_update("Scraping") api_url = f"{docling_url}/v1alpha/convert/source" headers = {"accept": "application/json", "Content-Type": "application/json"} all_responses = [] for url in urls: # add https to url if it does not start with http or https if not url.startswith("http") and not url.startswith("https"): url = f"https://{url}" data = { "options": {"to_formats": ["md", "doctags"]}, "http_sources": [{"url": url}], } if url.endswith(".pdf") or url.endswith(".docx"): docling_timeout = docling_timeout * 2 try: await emitter.progress_update(f"Scraping {url}") response = requests.post( api_url, headers=headers, json=data, timeout=docling_timeout ) await emitter.success_update("Scraping complete") except requests.exceptions.Timeout: await emitter.error_update(f"Timeout occurred for {url}") continue except requests.exceptions.RequestException as e: await emitter.error_update(f"{e}") return f"{e}" if response.status_code == 200: data = response.json() # Extract md_content and md_doctags from the JSON object md_content = data["document"]["md_content"] md_doctags = data["document"]["doctags_content"] # Use a ternary operator to set the 'content' variable content = md_content if md_content != "" else md_doctags all_responses.append(f"<url:{url}>\n{content}\n</url:{url}>\n") await emitter.citation(content, url) else: await emitter.error_update(f"Error fetching {url}") return "".join(all_responses) async def searxng( self, search_query: str, __event_emitter__: Callable[[dict], Awaitable[None]] ) -> str: """ Search the web and get the content of the relevant pages. Search for unknown knowledge, news, info, public contact info, weather, etc. :param query: Web Query used in search engine. :return: Provide the answer to the question as promptly as possible. """ searxng_url = self.valves.SEARXNG_URL searxng_score_filter = self.valves.SEARXNG_SCORE_FILTER searxng_score_result = self.valves.SEARXNG_SCORE_RESULT emitter = EventEmitter(__event_emitter__) api_url = f"{searxng_url}/search?q={search_query}&format=json&safesearch=0" headers = {"Content-Type": "application/json", "Accept": "application/json"} try: await emitter.progress_update("Searching") response = requests.get(api_url, headers=headers) except requests.exceptions.Timeout: await emitter.error_update(f"Timeout occurred for {url}") return f"Timeout occurred for {url}" except requests.exceptions.RequestException as e: await emitter.error_update(f"{e}") return f"{e}" await emitter.progress_update("Search Complete") if response.status_code == 200: data = response.json() await emitter.progress_update("Generating Response") data["results"] = [ result["url"] for result in data["results"] if result["score"] > searxng_score_filter ] results = await self.docling( data["results"][:searxng_score_result], __event_emitter__ ) await emitter.success_update(f"Complete! Search Query: {search_query}") return results else: await emitter.error_update( f"Failed to fetch results from SearXNG: {response.status_code}" ) return [f"Failed to fetch results from SearXNG: {response.status_code}"]