Tool
Tool Enhanced Web Scrape
Web scrape
Tool ID
tool_enhaced_web_scrape
Creator
@jariass
Downloads
268+

Tool Content
python
[{"id":"c023613c-ebc4-4708-bdae-3c454bf415d5","userId":"05ab441b-c69e-4b09-827f-ab99c4699ae7","tool":{"id":"web_scrape","name":"Enhanced Web Scrape","meta":{"description":"An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.","manifest":{"title":"Enhanced Web Scrape","description":"An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.","author":"ekatiyar","author_url":"https://github.com/ekatiyar","github":"https://github.com/ekatiyar/open-webui-tools","original_author":"Pyotr Growpotkin","original_author_url":"https://github.com/christ-offer/","original_github":"https://github.com/christ-offer/open-webui-tools","funding_url":"https://github.com/open-webui","version":"0.0.4","license":"MIT"}},"content":"\"\"\"\ntitle: Enhanced Web Scrape\ndescription: An improved web scraping tool that extracts text content using Jina Reader, now with better filtering, user-configuration, and UI feedback using emitters.\nauthor: ekatiyar\nauthor_url: https://github.com/ekatiyar\ngithub: https://github.com/ekatiyar/open-webui-tools\noriginal_author: Pyotr Growpotkin\noriginal_author_url: https://github.com/christ-offer/\noriginal_github: https://github.com/christ-offer/open-webui-tools\nfunding_url: https://github.com/open-webui\nversion: 0.0.4\nlicense: MIT\n\"\"\"\n\nimport requests\nfrom typing import Callable, Any\nimport re\nfrom pydantic import BaseModel, Field\n\nimport unittest\n\ndef extract_title(text):\n  \"\"\"\n  Extracts the title from a string containing structured text.\n\n  :param text: The input string containing the title.\n  :return: The extracted title string, or None if the title is not found.\n  \"\"\"\n  match = re.search(r'Title: (.*)\\n', text)\n  return match.group(1).strip() if match else None\n\ndef clean_urls(text) -> str:\n    \"\"\"\n    Cleans URLs 
from a string containing structured text.\n\n    :param text: The input string containing the URLs.\n    :return: The cleaned string with URLs removed.\n    \"\"\"\n    return re.sub(r'\\((http[^)]+)\\)', '', text)\n\nclass EventEmitter:\n    def __init__(self, event_emitter: Callable[[dict], Any] = None):\n        self.event_emitter = event_emitter\n\n    async def progress_update(self, description):\n        await self.emit(description)\n\n    async def error_update(self, description):\n        await self.emit(description, \"error\", True)\n\n    async def success_update(self, description):\n        await self.emit(description, \"success\", True)\n\n    async def emit(self, description=\"Unknown State\", status=\"in_progress\", done=False):\n        if self.event_emitter:\n            await self.event_emitter(\n                {\n                    \"type\": \"status\",\n                    \"data\": {\n                        \"status\": status,\n                        \"description\": description,\n                        \"done\": done,\n                    },\n                }\n            )\n\nclass Tools:\n    class Valves(BaseModel):\n        DISABLE_CACHING: bool = Field(\n            default=False, description=\"Bypass Jina Cache when scraping\"\n        )\n        GLOBAL_JINA_API_KEY: str = Field(\n            default=\"\",\n            description=\"(Optional) Jina API key. Allows a higher rate limit when scraping. Used when a User-specific API key is not available.\"\n        )\n\n    class UserValves(BaseModel):\n        CLEAN_CONTENT: bool = Field(\n            default=True, description=\"Remove links and image urls from scraped content. This reduces the number of tokens.\"\n        )\n        JINA_API_KEY: str = Field(\n            default=\"\",\n            description=\"(Optional) Jina API key. 
Allows a higher rate limit when scraping.\"\n        )\n\n    def __init__(self):\n        self.valves = self.Valves()\n        self.citation = True\n\n    async def web_scrape(self, url: str, __event_emitter__: Callable[[dict], Any] = None, __user__: dict = {}) -> str:\n        \"\"\"\n        Scrape and process a web page using r.jina.ai\n\n        :param url: The URL of the web page to scrape.\n        :return: The scraped and processed webpage content, or an error message.\n        \"\"\"\n        emitter = EventEmitter(__event_emitter__)\n\n        await emitter.progress_update(f\"Scraping {url}\")\n        jina_url = f\"https://r.jina.ai/{url}\"\n\n        headers = {\n            \"X-No-Cache\": \"true\" if self.valves.DISABLE_CACHING else \"false\",\n            \"X-With-Generated-Alt\": \"true\",\n        }\n\n        if \"valves\" in __user__ and __user__[\"valves\"].JINA_API_KEY:\n            headers[\"Authorization\"] = f\"Bearer {__user__['valves'].JINA_API_KEY}\"\n        elif self.valves.GLOBAL_JINA_API_KEY:\n            headers[\"Authorization\"] = f\"Bearer {self.valves.GLOBAL_JINA_API_KEY}\"\n\n        try:\n            response = requests.get(jina_url, headers=headers)\n            response.raise_for_status()\n\n            should_clean = \"valves\" not in __user__ or __user__[\"valves\"].CLEAN_CONTENT\n            if should_clean:\n                await emitter.progress_update(\"Received content, cleaning up ...\")\n            content = clean_urls(response.text) if should_clean else response.text\n\n            title = extract_title(content)\n            await emitter.success_update(f\"Successfully Scraped {title if title else url}\")\n            return content\n\n        except requests.RequestException as e:\n            error_message = f\"Error scraping web page: {str(e)}\"\n            await emitter.error_update(error_message)\n            return error_message\n        \nclass WebScrapeTest(unittest.IsolatedAsyncioTestCase):\n    async def 
test_web_scrape(self):\n        url = \"https://toscrape.com/\"\n        content = await Tools().web_scrape(url)\n        self.assertEqual(\"Scraping Sandbox\", extract_title(content))\n        self.assertEqual(len(content), 770)\n\nif __name__ == \"__main__\":\n    print(\"Running tests...\")\n    unittest.main()"},"downloads":3075,"upvotes":0,"downvotes":0,"updatedAt":1723312692,"createdAt":1723162249,"user":{"id":"05ab441b-c69e-4b09-827f-ab99c4699ae7","username":"whirlybird","name":"","profileImageUrl":"https://www.gravatar.com/avatar/1de70ad46562a438f07551b53492eee22532ee730844222dba20a17bc1d00b08?d=mp","createdAt":1722482588}}]