Tool: Research (v0.2.3)
Tool ID: research
Creator: @t30d0r99
Downloads: 1.2K+

Research using SRNXG with BeautifulSoup (Optimized Deep Research)
README
No README available
Tool Code
""" title:Web Search using SRNXG with BeautifulSoup (Optimized Deep Research) author: Teodor Cucu (Credits to Jacob DeLacerda for giving me the Idea) version: 0.2.3 github: https://github.com/the-real-t30d0r/research-openwebui license: MIT YOU NEED THIS PROMPT IN ORDER TO USE IT: https://openwebui.com/p/t30d0r99/research """ #!/usr/bin/env python3 import os import json from urllib.parse import urlparse import re import unicodedata from pydantic import BaseModel, Field import asyncio import aiohttp from typing import Any, Callable from bs4 import BeautifulSoup class HelpFunctions: def get_base_url(self, url: str) -> str: url_components = urlparse(url) return f"{url_components.scheme}://{url_components.netloc}" def generate_excerpt(self, content: str, max_length: int = 200) -> str: return content[:max_length] + "..." if len(content) > max_length else content def format_text(self, text: str) -> str: text = unicodedata.normalize("NFKC", text) text = re.sub(r"\s+", " ", text) return text.strip() def remove_emojis(self, text: str) -> str: return "".join(c for c in text if not unicodedata.category(c).startswith("So")) def truncate_to_n_words(self, text: str, n: int) -> str: words = text.split() return " ".join(words[:n]) async def fallback_scrape_async( self, url_site: str, timeout: int = 20, retries: int = 3 ) -> Any: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" } for attempt in range(retries): try: async with aiohttp.ClientSession() as session: async with session.get( url_site, headers=headers, timeout=timeout ) as response: response.raise_for_status() html_content = await response.text() soup = BeautifulSoup(html_content, "html.parser") for script in soup(["script", "style"]): script.extract() text = soup.get_text(separator=" ") formatted_text = self.format_text(text) if len(formatted_text) < 50: raise ValueError("Content too short") return formatted_text except Exception as e: await asyncio.sleep(1) return None async def process_search_result(self, result: dict, valves: Any) -> Any: url_site = result.get("url", "") if not url_site: return None if valves.IGNORED_WEBSITES: base_url = self.get_base_url(url_site) if any( ignored.strip() in base_url for ignored in valves.IGNORED_WEBSITES.split(",") ): return None content = await self.fallback_scrape_async(url_site) if not content: content = result.get("content", "") if not content or len(content) < 50: return None return { "title": self.remove_emojis(result.get("title", "")), "url": url_site, "content": self.truncate_to_n_words( content, valves.PAGE_CONTENT_WORDS_LIMIT ), "snippet": self.remove_emojis(result.get("content", "")), } class EventEmitter: def __init__(self, event_emitter: Callable[[dict], Any] = None): self.event_emitter = event_emitter async def emit(self, description="Unknown State", status="in_progress", done=False): if self.event_emitter: await self.event_emitter( { "type": "status", "data": { "status": status, "description": description, "done": done, }, } ) class Tools: class Valves(BaseModel): SRNXG_API_BASE_URL: str = Field( default="http://0.0.0.0:9090", description="Local SearXNG API base URL" ) IGNORED_WEBSITES: str = Field( default="", description="Comma-separated list of websites to ignore" ) TOTAL_PAGES_COUNT: int = Field( default=100, description="Number of pages to search per query" ) RETURNED_PAGES_COUNT: int = Field( default=100, description="Number of pages to return" ) PAGE_CONTENT_WORDS_LIMIT: int = Field( 
            default=6000, description="Word limit per page for context"
        )
        CITATION_LINKS: bool = Field(
            default=True, description="Include citation metadata"
        )
        MAX_ITERATIONS: int = Field(
            default=5, description="Maximum iterations per query"
        )

    def __init__(self):
        self.valves = self.Valves()
        self.headers = {
            "X-No-Cache": "true",
            "X-With-Images-Summary": "true",
            "X-With-Links-Summary": "true",
        }

    def refine_query(self, topic: str, iteration: int) -> str:
        # Append a refinement phrase that broadens the query on later iterations.
        refine_terms = [
            "detailed analysis",
            "comprehensive review",
            "in-depth insights",
            "extended study",
            "thorough investigation",
        ]
        term = refine_terms[min(iteration, len(refine_terms) - 1)]
        return f"{topic} {term}"

    def generate_report(self, topic: str, results: list) -> str:
        # Assemble a markdown report listing each source with placeholder
        # analysis sections, followed by a citation list.
        report = f"# Deep Research Report on {topic}\n\n"
        report += "## Mission Outcome and Planning\n"
        report += (
            "A series of iterative internet searches were performed. For each source, "
            "key insights, adjustments to current findings, and missing information were noted. "
            "This process repeated until sufficient data was gathered or "
            + str(self.valves.MAX_ITERATIONS)
            + " iterations were reached.\n\n"
        )
        if not results:
            report += "No relevant sources were found.\n"
            return report
        for idx, result in enumerate(results, start=1):
            report += f"### Source {idx}: {result['title']}\n"
            report += f"**URL:** {result['url']}\n\n"
            report += "#### Key Insights\n"
            report += f"{result['content'][:300]}...\n\n"
            report += "#### Adjustments to Current Findings\n"
            report += "To be determined based on further analysis.\n\n"
            report += "#### Missing Information\n"
            report += "To be determined based on further analysis.\n\n"
        report += "## Citations\n"
        for idx, result in enumerate(results, start=1):
            report += f"{idx}. [{result['title']}]({result['url']})\n"
        return report

    async def search_web(
        self, query: str, __event_emitter__: Callable[[dict], Any] = None
    ) -> str:
        functions = HelpFunctions()
        emitter = EventEmitter(__event_emitter__)
        topic = query
        search_query = query
        await emitter.emit(
            description=f"Internet search initiated for: {topic}. Please wait...",
            status="in_progress",
            done=False,
        )
        all_results = []
        seen_urls = set()
        max_iterations = self.valves.MAX_ITERATIONS
        for iteration in range(max_iterations):
            offset = iteration * self.valves.TOTAL_PAGES_COUNT
            await emitter.emit(
                description=f"Iteration {iteration+1}: Searching for '{search_query}' with offset {offset}...",
                status="in_progress",
                done=False,
            )
            params = {
                "q": search_query,
                "format": "json",
                "number_of_results": self.valves.TOTAL_PAGES_COUNT,
                "offset": offset,
            }
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                        f"{self.valves.SRNXG_API_BASE_URL}/search",
                        params=params,
                        headers=self.headers,
                        timeout=120,
                    ) as response:
                        response.raise_for_status()
                        json_data = await response.json()
                        search_items = json_data.get("results", [])
            except Exception as e:
                await emitter.emit(
                    description=f"Search error: {str(e)}", status="error", done=True
                )
                break
            if not search_items:
                await emitter.emit(
                    description="No more search results found. Ending search iterations.",
                    status="in_progress",
                    done=False,
                )
                break
            new_results = []
            tasks = [
                functions.process_search_result(
                    {
                        "title": item.get("title", ""),
                        "url": item.get("url", ""),
                        "content": item.get("snippet", ""),
                    },
                    self.valves,
                )
                for item in search_items
            ]
            processed_results = await asyncio.gather(*tasks)
            for res in processed_results:
                if res and res["url"] not in seen_urls:
                    seen_urls.add(res["url"])
                    new_results.append(res)
            if not new_results:
                break
            all_results.extend(new_results)
            if self.valves.CITATION_LINKS and __event_emitter__:
                for result in new_results:
                    await __event_emitter__(
                        {
                            "type": "citation",
                            "data": {
                                "document": [result["content"]],
                                "metadata": [{"source": result["url"]}],
                                "source": {"name": result["title"]},
                            },
                        }
                    )
            await emitter.emit(
                description=f"Iteration {iteration+1} added {len(new_results)} valid results (Total: {len(all_results)})",
                status="in_progress",
                done=False,
            )
            if len(all_results) >= self.valves.RETURNED_PAGES_COUNT:
                break
        if not all_results:
            await emitter.emit(
                description="Web search completed. No relevant sources were found.",
                status="complete",
                done=True,
            )
            report = self.generate_report(topic, [])
        else:
            if self.valves.CITATION_LINKS and __event_emitter__:
                citation_summary = [
                    {"title": r["title"], "url": r["url"]} for r in all_results
                ]
                await __event_emitter__(
                    {
                        "type": "citation_summary",
                        "data": {
                            "message": f"Visited {len(all_results)} websites.",
                            "citations": citation_summary,
                        },
                    }
                )
            await emitter.emit(
                description=f"Web search completed. Retrieved content from {len(all_results)} pages.",
                status="complete",
                done=True,
            )
            report = self.generate_report(topic, all_results)
        return report


async def main():
    async def my_event_handler(event: dict):
        print(
            f"Event: {event['data']['status']} - {event['data']['description']} (Done: {event['data']['done']})"
        )

    tools = Tools()
    report = await tools.search_web("test", __event_emitter__=my_event_handler)
    print("\n--- REPORT ---\n")
    print(report)


if __name__ == "__main__":
    asyncio.run(main())
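For quick experimentation outside of Open WebUI, the valves can be adjusted on a Tools instance before calling search_web, mirroring the main() example above. The snippet below is a minimal sketch that only touches fields defined in Valves; the base URL, ignored domains, limits, and query shown are placeholder values for illustration, not recommended settings.

import asyncio

async def run_small_search():
    tools = Tools()
    # Placeholder values: point these at your own SearXNG instance and preferences.
    tools.valves.SRNXG_API_BASE_URL = "http://localhost:9090"
    tools.valves.IGNORED_WEBSITES = "pinterest.com,facebook.com"
    tools.valves.TOTAL_PAGES_COUNT = 10      # results requested per iteration
    tools.valves.RETURNED_PAGES_COUNT = 10   # stop once 10 usable pages are collected
    tools.valves.MAX_ITERATIONS = 2          # at most two search passes
    report = await tools.search_web("example research topic")
    print(report)

asyncio.run(run_small_search())

Without an event emitter, the status and citation callbacks are simply skipped, so the call returns only the generated markdown report.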