Whitepaper
Docs
Sign In
Tool
Tool
v0.2.1
Gitea Repository Scraper
Tool ID
gitea_repository_scraper
Creator
@pakobbix
Downloads
23+
Scrape the content of a Gitea repository and return it to the LLM Context.
Get
README
Tool Code
Show
"""
title: Gitea repository scraper
description: Scrape the content of a Gitea repository and return it to the LLM Context.
author: Pakobbix
author_url: zephyre.one
github:
funding_url:
version: 0.2.1
license: MIT
"""

"""
Changelog:
- v0.2.1 - Fixed the EventEmitter to close progress. Added filename and delimiter for better LLM parsing.
- v0.2.0 - Added Valves class to store the Gitea username, base URL, and access token using the open-webui UI.
- v0.1.0 - Initial version
"""

import asyncio
from typing import Any, Callable, Optional

import requests
from pydantic import BaseModel, Field

# Upper bound (seconds) for one HTTP request, so a stalled Gitea server cannot
# hang the tool indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


async def _http_get(url: str, headers: dict):
    """Run a blocking ``requests.get`` in the default executor.

    ``requests`` is synchronous; calling it directly inside a coroutine would
    block the event loop for the whole duration of the request.

    :param url: Absolute URL to fetch.
    :param headers: HTTP headers sent with the request.
    :return: The ``requests`` response object.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None,
        lambda: requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS),
    )


# Took it from the examples from the open-webui documentation.
class EventEmitter:
    """Send status events for a task through an optional async callback."""

    def __init__(self, event_emitter: Optional[Callable[[dict], Any]] = None):
        # When no callback is given, every update below is a no-op.
        self.event_emitter = event_emitter

    async def progress_update(self, description):
        """Emit an "in_progress" status that leaves the task open."""
        await self.emit(description)

    async def error_update(self, description):
        """Emit an "error" status and mark the task as done."""
        await self.emit(description, "error", True)

    async def success_update(self, description):
        """Emit a "success" status and mark the task as done."""
        await self.emit(description, "success", True)

    async def emit(self, description="Unknown State", status="in_progress", done=False):
        """Emit a status event with the corresponding status type.

        :param description: Human-readable text for the status.
        :param status: Status type placed in the event payload.
        :param done: Whether the event closes the task.
        """
        if self.event_emitter:
            await self.event_emitter(
                {
                    "type": "status",
                    "data": {
                        "status": status,
                        "description": description,
                        "done": done,
                    },
                }
            )


class Tools:
    """Tool that reads the files of a Gitea repository through the Gitea API."""

    # Valves class to store the Gitea username, base URL, and access token
    # using the open-webui UI.
    class Valves(BaseModel):
        username: str = Field("", description="Gitea username of the user")
        base_url: str = Field(
            "",
            description="Base URL of the Gitea instance, e.g., https://gitea.example.com",
        )
        access_token: str = Field(
            "", description="Access token for Gitea API authentication"
        )

    def __init__(self):
        # Initialize the Valves so the functions below can read the Gitea
        # username, base URL, and access token from them.
        self.valves = self.Valves()

    async def get_repository_files_content(
        self,
        repo_name: str,
        __event_emitter__: Optional[Callable[[dict], Any]] = None,
    ) -> str:
        """
        Fetch the text content of all files in a Gitea repository.

        :param repo_name: Name of the repository, owned by the configured Gitea username.
        :param __event_emitter__: Optional async callback that receives status events.
        :return: The concatenated text of every text file in the repository.
        :raises Exception: If no access token is configured or the API request fails.
        """
        emitter = EventEmitter(__event_emitter__)
        await emitter.progress_update(
            "Creating request to fetch repository files content"
        )

        self.access_token = self.valves.access_token
        if not self.access_token:
            # The token is read from the Valves, so the message points there.
            message = "Gitea access token is not set in the tool valves."
            await emitter.error_update(message)
            raise Exception(message)

        # Strip a trailing slash so "https://host/" does not yield "//api/v1".
        base_url = self.valves.base_url.rstrip("/")
        self.repo_url = (
            f"{base_url}/api/v1/repos/{self.valves.username}/{repo_name}/contents/"
        )

        await emitter.success_update("Creating request to fetch repository completed")
        # Forward the callback so the fetch step can report progress and errors.
        return await self._fetch_contents(self.repo_url, __event_emitter__)

    async def _fetch_contents(
        self, url: str, __event_emitter__: Optional[Callable[[dict], Any]] = None
    ) -> str:
        """
        Recursively collect the text files listed at a Gitea contents URL.

        :param url: Gitea API "contents" URL of a repository or directory.
        :param __event_emitter__: Optional async callback that receives status events.
        :return: The concatenated text of all text files below ``url``.
        :raises Exception: If the listing request does not return HTTP 200.
        """
        emitter = EventEmitter(__event_emitter__)
        await emitter.progress_update("Fetching contents")

        # Use the access token for authentication. It is read from the valves
        # so this method does not depend on the public entry point having run.
        headers = {"Authorization": f"token {self.valves.access_token}"}

        # Make the request to the Gitea API to fetch the contents listing.
        response = await _http_get(url, headers)
        if response.status_code != 200:
            message = f"Failed to fetch repository: {response.status_code}"
            await emitter.error_update(message)
            raise Exception(message)

        # Collect pieces and join once instead of repeated string concatenation.
        parts = []
        for item in response.json():
            if item["type"] == "file":
                # Fetch the file content using the download URL.
                file_response = await _http_get(item["download_url"], headers)
                if file_response.status_code != 200:
                    # Best effort: a single unreadable file is skipped.
                    print(
                        f"Failed to fetch file {item['download_url']}: {file_response.status_code}"
                    )
                    continue

                # Check MIME type to ignore binary files and images. A missing
                # header is treated as non-text instead of raising KeyError.
                if "text" in file_response.headers.get("Content-Type", ""):
                    # NOTE(review): the changelog mentions a filename and a
                    # delimiter per file, but only the raw text is appended
                    # here - confirm the intended output format.
                    parts.append(f"{file_response.text}\n")
                else:
                    print(f"Ignored binary file or image: {item['download_url']}")
            elif item["type"] == "dir":
                # Recursively fetch contents of the directory.
                parts.append(
                    await self._fetch_contents(item["url"], __event_emitter__)
                )

        await emitter.success_update("Fetching contents completed")
        # Return the concatenated contents of all files to the LLM Context.
        return "".join(parts)