Easy Wikipedia Tool • Open WebUI Community

"""
title: Easy Wikipedia
author: Jason Mulligan <[email protected]>
author_url: https://github.com/avoidwork
funding_url: https://github.com/avoidwork/easy-wikipedia
version: 1.0.7
"""

import json
import re
from html.parser import HTMLParser
from urllib.parse import quote

import requests

BASE_URL = "https://en.wikipedia.org"
LANGUAGE = "en-US"

# Seconds to wait for Wikipedia before giving up; without a timeout a stalled
# connection would block the caller indefinitely.
REQUEST_TIMEOUT = 10

# Matches bracketed markers such as "[1]" or "[edit]" that are removed from
# the extracted text.
_BRACKETED_RE = re.compile(r"\[[^\]]+\]")


class EasyWikipediaHTMLParser(HTMLParser):
    """Extract the text of headings (h1-h4) and paragraphs from an HTML page.

    Feed HTML with ``feed()`` and collect the plain-text result with
    ``close()``. All parsing state is kept per instance, so separate parsers
    never see each other's text.
    """

    _tags = ("h1", "h2", "h3", "h4", "p")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Finished text blocks, in document order.
        self._text = []
        # Data fragments of the element currently being captured.
        self._stream = []
        # True while inside one of the tags listed in ``_tags``.
        self._capture = False

    def handle_starttag(self, tag, _args):
        """Start capturing text when a heading or paragraph opens."""
        if tag in self._tags:
            self._capture = True
            self._stream.clear()

    def handle_endtag(self, tag):
        """Finish the captured element and store its cleaned-up text."""
        if tag not in self._tags:
            return

        self._capture = False
        # Remove bracketed markers first, then trim, so that a marker at the
        # edge of the text does not leave stray whitespace behind.
        text = _BRACKETED_RE.sub("", "".join(self._stream)).strip()
        self._stream.clear()

        # Fragments of one character or less are dropped.
        if len(text) > 1:
            # Paragraphs get an extra newline so they end up separated by a
            # blank line once the blocks are joined in close().
            trailing_char = "\n" if tag == "p" else ""
            self._text.append(f"{text}{trailing_char}")

    def handle_data(self, data):
        """Collect text data while inside a captured element."""
        if not self._capture:
            return

        # Skip chunks in which every space-separated word starts with "."
        # (presumably inline CSS selectors embedded in the page; verify).
        invalid = len(data.strip()) > 1 and all(
            word.startswith(".") for word in data.split(" ")
        )
        if not invalid:
            self._stream.append(data)

    def close(self):
        """Flush the parser and return the collected text.

        :return: Captured blocks joined by newlines. The parser's collected
            text is reset afterwards.
        """
        # The base class must be given the chance to process buffered input.
        super().close()
        text = "\n".join(self._text)
        self._text.clear()
        self._stream.clear()
        self._capture = False
        return text


def parse_html(page_html: str) -> str:
    """Return the heading and paragraph text found in ``page_html``."""
    parser = EasyWikipediaHTMLParser()
    parser.feed(page_html)
    return parser.close()


def sanitize(text: str) -> str:
    """Trim surrounding whitespace, then double quotes, then single quotes."""
    return text.strip().strip('"').strip("'")


def get_page(title: str) -> str:
    """Fetch a Wikipedia page and return its text content.

    :param title: Page title; surrounding whitespace and quotes are removed.
    :return: Extracted page text, or a failure message when the request
        fails, the server answers with an error status, or the body is empty.
    """
    page_title = sanitize(title)
    failure = f'Failed to fetch page for "{page_title}".'

    # The title is a single path segment, so "/" must be escaped as well
    # (quote() leaves it untouched by default).
    encoded_title = quote(page_title, safe="")
    url = f"{BASE_URL}/api/rest_v1/page/html/{encoded_title}?redirect=false&stash=false"
    headers = {
        "Accept": 'text/html; charset=utf-8; profile="https://www.mediawiki.org/wiki/Specs/HTML/2.1.0"',
        "Accept-Language": LANGUAGE,
    }

    try:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return failure

    # An error response carries an error document, not the page itself.
    if not resp.ok or not resp.text:
        return failure

    return parse_html(resp.text)


class Tools:
    def __init__(self) -> None:
        # Not read anywhere in this file; presumably consumed by the host
        # application that loads this tool (verify).
        self.citation = True

    def search(self, query: str) -> str:
        """
        Search Wikipedia & retrieve the first result.
        :param query: Search query.
        :return: Summary of the page.
        """
        search_query = sanitize(query)
        failure = f'Failed to find results for "{search_query}".'

        url = f"{BASE_URL}/w/api.php?action=opensearch&search={quote(search_query)}&limit=1&namespace=0&format=json"
        headers = {"Accept": "application/json", "Accept-Language": LANGUAGE}

        try:
            resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            data = resp.json()
        except (requests.RequestException, ValueError):
            # Network failure, or a body that is not valid JSON.
            return failure

        # The code reads the matching titles from the second element of the
        # returned list; anything else is treated as "no results".
        if not isinstance(data, list) or len(data) < 2:
            return failure
        titles = data[1]
        if not isinstance(titles, list) or not titles:
            return failure

        return get_page(title=titles[0])