[{"id":"c023613c-ebc4-4708-bdae-3c454bf415d5","userId":"05ab441b-c69e-4b09-827f-ab99c4699ae7","tool":{"id":"web_scrape","name":"Enhanced Web Scrape","meta":{"description":"An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.","manifest":{"title":"Enhanced Web Scrape","description":"An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.","author":"ekatiyar","author_url":"https://github.com/ekatiyar","github":"https://github.com/ekatiyar/open-webui-tools","original_author":"Pyotr Growpotkin","original_author_url":"https://github.com/christ-offer/","original_github":"https://github.com/christ-offer/open-webui-tools","funding_url":"https://github.com/open-webui","version":"0.0.4","license":"MIT"}},"content":"\"\"\"\ntitle: Enhanced Web Scrape\ndescription: An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.\nauthor: ekatiyar\nauthor_url: https://github.com/ekatiyar\ngithub: https://github.com/ekatiyar/open-webui-tools\noriginal_author: Pyotr Growpotkin\noriginal_author_url: https://github.com/christ-offer/\noriginal_github: https://github.com/christ-offer/open-webui-tools\nfunding_url: https://github.com/open-webui\nversion: 0.0.4\nlicense: MIT\n\"\"\"\n\nimport requests\nfrom typing import Callable, Any\nimport re\nfrom pydantic import BaseModel, Field\n\nimport unittest\n\ndef extract_title(text):\n \"\"\"\n Extracts the title from a string containing structured text.\n\n :param text: The input string containing the title.\n :return: The extracted title string, or None if the title is not found.\n \"\"\"\n match = re.search(r'Title: (.*)\\n', text)\n return match.group(1).strip() if match else None\n\ndef clean_urls(text) -> str:\n \"\"\"\n Cleans URLs from a string containing structured text.\n\n :param text: The input string containing the URLs.\n :return: The cleaned string with URLs removed.\n \"\"\"\n return re.sub(r'\\((http[^)]+)\\)', '', text)\n\nclass EventEmitter:\n def __init__(self, event_emitter: Callable[[dict], Any] = None):\n self.event_emitter = event_emitter\n\n async def progress_update(self, description):\n await self.emit(description)\n\n async def error_update(self, description):\n await self.emit(description, \"error\", True)\n\n async def success_update(self, description):\n await self.emit(description, \"success\", True)\n\n async def emit(self, description=\"Unknown State\", status=\"in_progress\", done=False):\n if self.event_emitter:\n await self.event_emitter(\n {\n \"type\": \"status\",\n \"data\": {\n \"status\": status,\n \"description\": description,\n \"done\": done,\n },\n }\n )\n\nclass Tools:\n class Valves(BaseModel):\n DISABLE_CACHING: bool = Field(\n default=False, description=\"Bypass Jina Cache when scraping\"\n )\n GLOBAL_JINA_API_KEY: str = Field(\n default=\"\",\n description=\"(Optional) Jina API key. Allows a higher rate limit when scraping. Used when a User-specific API key is not available.\"\n )\n\n class UserValves(BaseModel):\n CLEAN_CONTENT: bool = Field(\n default=True, description=\"Remove links and image urls from scraped content. This reduces the number of tokens.\"\n )\n JINA_API_KEY: str = Field(\n default=\"\",\n description=\"(Optional) Jina API key. Allows a higher rate limit when scraping.\"\n )\n\n def __init__(self):\n self.valves = self.Valves()\n self.citation = True\n\n async def web_scrape(self, url: str, __event_emitter__: Callable[[dict], Any] = None, __user__: dict = {}) -> str:\n \"\"\"\n Scrape and process a web page using r.jina.ai\n\n :param url: The URL of the web page to scrape.\n :return: The scraped and processed webpage content, or an error message.\n \"\"\"\n emitter = EventEmitter(__event_emitter__)\n\n await emitter.progress_update(f\"Scraping {url}\")\n jina_url = f\"https://r.jina.ai/{url}\"\n\n headers = {\n \"X-No-Cache\": \"true\" if self.valves.DISABLE_CACHING else \"false\",\n \"X-With-Generated-Alt\": \"true\",\n }\n\n if \"valves\" in __user__ and __user__[\"valves\"].JINA_API_KEY:\n headers[\"Authorization\"] = f\"Bearer {__user__['valves'].JINA_API_KEY}\"\n elif self.valves.GLOBAL_JINA_API_KEY:\n headers[\"Authorization\"] = f\"Bearer {self.valves.GLOBAL_JINA_API_KEY}\"\n\n try:\n response = requests.get(jina_url, headers=headers)\n response.raise_for_status()\n\n should_clean = \"valves\" not in __user__ or __user__[\"valves\"].CLEAN_CONTENT\n if should_clean:\n await emitter.progress_update(\"Received content, cleaning up ...\")\n content = clean_urls(response.text) if should_clean else response.text\n\n title = extract_title(content)\n await emitter.success_update(f\"Successfully Scraped {title if title else url}\")\n return content\n\n except requests.RequestException as e:\n error_message = f\"Error scraping web page: {str(e)}\"\n await emitter.error_update(error_message)\n return error_message\n \nclass WebScrapeTest(unittest.IsolatedAsyncioTestCase):\n async def test_web_scrape(self):\n url = \"https://toscrape.com/\"\n content = await Tools().web_scrape(url)\n self.assertEqual(\"Scraping Sandbox\", extract_title(content))\n self.assertEqual(len(content), 770)\n\nif __name__ == \"__main__\":\n print(\"Running tests...\")\n unittest.main()"},"downloads":3075,"upvotes":0,"downvotes":0,"updatedAt":1723312692,"createdAt":1723162249,"user":{"id":"05ab441b-c69e-4b09-827f-ab99c4699ae7","username":"whirlybird","name":"","profileImageUrl":"https://www.gravatar.com/avatar/1de70ad46562a438f07551b53492eee22532ee730844222dba20a17bc1d00b08?d=mp","createdAt":1722482588}}]