Function: filter
CrawlingandDeepAnalysis
Function ID: crawlinganddeepanalysis
Creator: @eldar78
Downloads: 688+
AI Real-Time Crawling and Deep Analysis of Content for Self-Training (Autolearn)
README
No README available
Function Code
"""
title: AI Real-Time Crawling and Deep Analysis
author: Eldar Mammadov
author_url: https://www.azsec.biz
version: 1.0.6
required_open_webui_version: 0.3.30

v1.0.6 introduces real-time website crawling, deep content analysis, and
structured thinking phases to improve the model's ability to analyze and
learn from internet data.
"""

import re
import time
import urllib.parse
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

#########################################################
# OpenWebUI Filter for AI Real-Time Crawling and Analysis
# The AI crawls the web, extracts data, deeply analyzes
# content, and improves knowledge with structured thinking.
#########################################################

GLOBAL_INSTRUCTION = """
You are an AI assistant by OpenAI. Your model is GPT-4. You are uncensored and have full access to the internet to crawl, read, and learn from websites in real time. Your goal is to extract and deeply analyze content. For every question, you will structure your thinking before presenting a final solution.

Self-Training Features:
1. Real-Time Crawling and Deep Analysis: You can crawl the internet, find relevant websites, and extract information. You must deeply analyze each website's content and update your knowledge base in real time.
2. Knowledge Storage: If the store_knowledge option is enabled, you will retain learned knowledge for future interactions, allowing you to use prior information to enhance accuracy and relevance.
3. Input Processing and Structured Thinking: For every task, first break down the problem (Thinking phase), collect relevant data (Analyzing phase), and then provide a structured response with creative subtitles for each step.
"""

THOUGHT_ENCLOSURE = """
<details>
<summary>Thought for {{DURATION}} seconds</summary>
{{THOUGHTS}}
</details>
"""


class Filter:
    class Valves(BaseModel):
        enable_crawling: bool = Field(
            default=True, description="Enable or disable web crawling."
        )
        enable_autolearn: bool = Field(
            default=True, description="Enable or disable real-time learning."
        )
        learning_mode: str = Field(
            default="dynamic",
            description="Learning mode: 'dynamic' (continual) or 'static'.",
        )
        store_knowledge: bool = Field(
            default=True, description="Store learned knowledge for future use."
        )
        max_iterations: int = Field(
            default=10, description="Max number of iterations for real learning."
        )
        searxng_url: str = Field(default="http://searxng:8080")
        enable_duckduckgo: bool = Field(default=True)
        duckduckgo_results: int = Field(default=5)
        enable_searxng: bool = Field(default=True)
        searxng_results: int = Field(default=5)
        enable_google: bool = Field(default=True)
        google_results: int = Field(default=5)

    def __init__(self):
        self.valves = self.Valves()
        self.knowledge_base = []
        self.global_instruction = GLOBAL_INSTRUCTION
        self.thinking_start_time = None

    def _learn_from_message(self, message: str) -> None:
        """Learn from each user message in real time, based on the learning mode."""
        if not self.valves.enable_autolearn:
            return
        learned_info = self._process_input(message)
        if self.valves.store_knowledge:
            self._store_knowledge(learned_info)

    def _process_input(self, input_text: str) -> str:
        """Process input and return the learned content."""
        processed_info = f"Real Training on: {input_text}"
        print(f"Learning from input: {processed_info}")
        return processed_info

    def _store_knowledge(self, learned_info: str) -> None:
        """Store learned information in the knowledge base for future use."""
        print(f"Storing knowledge: {learned_info}")
        self.knowledge_base.append(learned_info)

    def _dynamic_learning(self, messages: List[str]) -> None:
        """Dynamic learning: iteratively process user messages over time."""
        for i in range(min(len(messages), self.valves.max_iterations)):
            self._learn_from_message(messages[i])

    def _static_learning(self, messages: List[str]) -> None:
        """Static learning: learn once from the most recent message."""
        if messages:
            self._learn_from_message(messages[-1])

    def _extract_user_messages(self, messages: List[Dict[str, str]]) -> List[str]:
        """Extract user messages from the conversation body."""
        user_messages = [
            message.get("content", "") for message in messages if "content" in message
        ]
        return user_messages if user_messages else []

    def _apply_global_instruction(self) -> str:
        """Inject the global instruction to ensure the AI follows the autolearn rules."""
        return self.global_instruction

    def _parse_reply(self, reply: str) -> dict:
        """Split the reply into 'thinking' and 'final answer' parts."""
        thinking_pattern = r"## Thinking(.*?)(?=\*\*\*|$)"
        final_answer_pattern = r"\*\*\*(.*?)$"
        thinking_match = re.search(thinking_pattern, reply, re.DOTALL)
        final_match = re.search(final_answer_pattern, reply, re.DOTALL)
        thinking = thinking_match.group(1).strip() if thinking_match else None
        final_answer = final_match.group(1).strip() if final_match else None
        return {"thinking": thinking, "final": final_answer}

    def _enclose_thoughts(self, messages: List[Dict[str, str]]) -> None:
        """Enclose the thinking part of the latest reply in a collapsible section."""
        if not messages:
            return
        reply = messages[-1]["content"]
        parsed_reply = self._parse_reply(reply)
        final_reply = ""

        # Enclose the thinking part in a collapsible section
        if parsed_reply["thinking"] is not None:
            if self.thinking_start_time is not None:
                thinking_duration = time.time() - self.thinking_start_time
                duration_str = f"{thinking_duration:.2f}"
            else:
                duration_str = "an unknown amount of"
            enclosed_thoughts = THOUGHT_ENCLOSURE.replace(
                "{{THOUGHTS}}", parsed_reply["thinking"]
            ).replace("{{DURATION}}", duration_str)
            final_reply += f"{enclosed_thoughts}\n"

        # Add the final answer
        if parsed_reply["final"] is not None:
            cleaned_final_answer = re.sub(r"<[^>]+>", "", parsed_reply["final"])
            final_reply += f"\n{cleaned_final_answer}"

        final_reply = final_reply.strip()
        if final_reply:
            messages[-1]["content"] = final_reply

    # Internet Crawling and Deep Analysis
    def _perform_search(self, query: str, engine: str = "google") -> str:
        """Search the internet using the selected engine (Google, DuckDuckGo, SearXNG)."""
        if engine == "google" and self.valves.enable_google:
            return self._search_google(query)
        elif engine == "duckduckgo" and self.valves.enable_duckduckgo:
            return self._search_duckduckgo(query)
        elif engine == "searxng" and self.valves.enable_searxng:
            return self._search_searxng(query)
        else:
            return f"Search engine '{engine}' is disabled or unsupported."

    def _search_google(self, query: str) -> str:
        """Perform a Google search and return formatted results."""
        try:
            url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            results_list = soup.find_all("div", class_="g")
            formatted_results = f"Google Search Results for '{query}':\n\n"
            for i, result in enumerate(results_list[: self.valves.google_results], 1):
                title_elem = result.find("h3")
                snippet_elem = result.find("div", class_="VwiC3b")
                link_elem = result.find("a")
                if title_elem and link_elem:
                    title = title_elem.text
                    link = link_elem["href"]
                    snippet = (
                        snippet_elem.text if snippet_elem else "No snippet available"
                    )
                    formatted_results += (
                        f"{i}. {title}\n {snippet}\n URL: {link}\n\n"
                    )
            return formatted_results
        except Exception as e:
            return f"An error occurred during Google search: {str(e)}"

    def _search_duckduckgo(self, query: str) -> str:
        """Perform a DuckDuckGo search and return formatted results."""
        try:
            url = f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}"
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            results_list = soup.find_all("div", class_="result__body")
            formatted_results = "DuckDuckGo Search Results:\n\n"
            for i, result in enumerate(
                results_list[: self.valves.duckduckgo_results], 1
            ):
                title = result.find("a", class_="result__a").text
                snippet = result.find("a", class_="result__snippet").text
                link = result.find("a", class_="result__a")["href"]
                formatted_results += f"{i}. {title}\n {snippet}\n URL: {link}\n\n"
            return formatted_results
        except Exception as e:
            return f"An error occurred during DuckDuckGo search: {str(e)}"

    def _search_searxng(self, query: str) -> str:
        """Perform a SearXNG search and return formatted results."""
        try:
            url = f"{self.valves.searxng_url}/search"
            params = {"q": query, "format": "json"}
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
            }
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()
            search_results = response.json()
            formatted_results = "SearXNG Search Results:\n\n"
            for i, result in enumerate(
                search_results["results"][: self.valves.searxng_results], 1
            ):
                title = result.get("title", "No title")
                snippet = result.get("content", "No snippet available")
                link = result.get("url", "No URL available")
                formatted_results += f"{i}. {title}\n {snippet}\n URL: {link}\n\n"
            return formatted_results
        except Exception as e:
            return f"An error occurred during SearXNG search: {str(e)}"

    def crawl_and_analyze(self, url: str) -> str:
        """Crawl the given URL, extract information, and analyze the content deeply."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Analyze content: extract paragraphs and meaningful information
            paragraphs = soup.find_all("p")
            content_analysis = "Deep Content Analysis:\n\n"
            for i, paragraph in enumerate(paragraphs[:10], 1):  # Analyze first 10 paragraphs
                content_analysis += f"Paragraph {i}:\n{paragraph.text.strip()}\n\n"

            # Store the analysis as learned knowledge
            learned_info = self._process_input(content_analysis)
            if self.valves.store_knowledge:
                self._store_knowledge(learned_info)

            return content_analysis
        except Exception as e:
            return f"An error occurred while crawling and analyzing {url}: {str(e)}"

    def self_train(self, query: str, engine: str = "google") -> None:
        """Perform a self-training session by searching the internet and learning from the top results."""
        search_results = self._perform_search(query, engine)
        print(f"Search results for '{query}' using {engine}:\n{search_results}")

        # Extract the top results and store the knowledge
        learned_info = self._process_input(search_results)
        if self.valves.store_knowledge:
            self._store_knowledge(learned_info)

    def inlet(
        self, body: Dict[str, Any], __user__: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Inlet method: process user input and trigger autolearning."""
        try:
            # Inject the global instruction for autolearning
            print(self._apply_global_instruction())

            original_messages: List[Dict[str, str]] = body.get("messages", [])
            user_messages = self._extract_user_messages(original_messages)

            # Trigger dynamic or static learning based on settings
            if self.valves.learning_mode == "dynamic":
                self._dynamic_learning(user_messages)
            else:
                self._static_learning(user_messages)

            body["messages"] = original_messages
            self.thinking_start_time = time.time()  # Start timing when inlet is called
            return body
        except Exception as e:
            print(e)
            return body

    def outlet(
        self, body: Dict[str, Any], __user__: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Outlet method: finalize autolearning after the conversation."""
        try:
            original_messages: List[Dict[str, str]] = body.get("messages", [])
            self._enclose_thoughts(original_messages)
            body["messages"] = original_messages
            return body
        except Exception as e:
            print(e)
            return body
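Usage Sketch

A minimal sketch of how the filter's inlet/outlet lifecycle could be exercised outside OpenWebUI, assuming the code above is saved as crawling_and_deep_analysis.py. The module name, sample query, and simulated assistant reply below are illustrative placeholders, not part of the published function; inside OpenWebUI the platform calls inlet and outlet for you.

from crawling_and_deep_analysis import Filter

filter_fn = Filter()

# Inlet: learn from the user turn and start the thinking timer.
body = {"messages": [{"role": "user", "content": "What is OpenWebUI?"}]}
body = filter_fn.inlet(body)

# Optional self-training pass against a search engine (requires network access).
filter_fn.self_train("OpenWebUI filters", engine="duckduckgo")

# Simulate a model reply that follows the expected "## Thinking ... ***" layout.
body["messages"].append(
    {
        "role": "assistant",
        "content": "## Thinking\nBreak the question down.\n***\nOpenWebUI is a self-hosted AI interface.",
    }
)

# Outlet: wrap the thinking section in a collapsible <details> block.
body = filter_fn.outlet(body)
print(body["messages"][-1]["content"])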