Tool
Read Google Drive Files
Tool ID
google_drive
Creator
@suciocerdo
Downloads
956+
A tool for reading the content of public Google Drive files.
README
No README available
Tool Code
# THIS CODE WORKS BUT NOT YET AS A TOOL. STILL TRYING TO FIGURE OUT WHY
import requests
from bs4 import BeautifulSoup
import json
import re


class Tools:
    def __init__(self):
        pass

    def scrape_file_content(self, file_url: str) -> dict:
        """
        Scrape the content of a given file URL and return it as a dictionary.

        :param file_url: The URL of the file.
        :return: A dictionary containing the file URL and scraped content.
        """
        try:
            response = requests.get(file_url, timeout=120)
            response.raise_for_status()
            html_content = response.text
            soup = BeautifulSoup(html_content, "html.parser")
            content = soup.get_text(separator=" ", strip=True)
            # Optionally truncate content if it's very large
            truncated_content = " ".join(content.split()[:5000])
            return {"url": file_url, "content": truncated_content}
        except requests.exceptions.RequestException as e:
            return {"url": file_url, "content": f"Error retrieving file: {str(e)}"}

    def get_combined_files_content(self, folder_url: str, specific_files: list) -> list:
        """
        Get the combined content of multiple files as a list of dictionaries.

        :param folder_url: The URL of the folder containing multiple files.
        :param specific_files: A list of specific file URLs.
        :return: A list of dictionaries with each file's URL and content.
        """
        # Combine the files discovered in the folder with the explicitly listed files
        all_files = self.extract_files_from_folder(folder_url) + specific_files
        print(f"Processing a total of {len(all_files)} files.")
        # Collect the content of each file as a list of dictionaries
        all_content = [self.scrape_file_content(url) for url in all_files]
        return all_content

    def extract_files_from_folder(self, folder_url: str) -> list:
        """
        Extract all file URLs from a given folder using regex.

        :param folder_url: The URL of the folder.
        :return: A list of file URLs.
        """
        try:
            folder_id = folder_url.split('/')[-1]
            export_url = f"https://example.com/embeddedfolderview?id={folder_id}#list"
            response = requests.get(export_url, timeout=120)
            response.raise_for_status()
            html_content = response.text
            # Extract file URLs using regex
            drive_link_pattern = re.compile(r"https://example.com/file/d/[\w-]+/view")
            matches = drive_link_pattern.findall(html_content)
            # Rewrite each matched view link into a direct-download URL
            file_links = [
                f"https://example.com/uc?id={match.split('/d/')[1].split('/')[0]}"
                for match in matches
            ]
            print(f"Found {len(file_links)} files in the folder.")
            # Deduplicate before returning
            return list(set(file_links))
        except requests.exceptions.RequestException as e:
            print(f"Error accessing folder: {e}")
            return []

    def get_files_summary(self, folder_url: str, specific_files: list) -> str:
        """
        Get a JSON-formatted summary of the contents of multiple files.

        :param folder_url: The URL of the folder containing multiple files.
        :param specific_files: A list of specific file URLs.
        :return: A JSON string with the content of all files.
        """
        combined_content = self.get_combined_files_content(folder_url, specific_files)
        # Convert the list of dictionaries into a JSON-formatted string
        json_output = json.dumps(combined_content, ensure_ascii=False, indent=2)
        return json_output


# Initialize the Tools instance
tools = Tools()

# Dummy URLs for a public folder and specific files
folder_url = 'https://example.com/drive/folders/folderID'  # Dummy folder URL. Folder needs to be shared to anyone with the link
specific_files = [
    'https://example.com/uc?id=dummyFileID1',  # Dummy file URL 1. File needs to be shared to anyone with the link
    'https://example.com/uc?id=dummyFileID2',  # Dummy file URL 2. File needs to be shared to anyone with the link
]

# Retrieve the result in JSON format
result_json = tools.get_files_summary(folder_url, specific_files)
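For reference, a minimal sketch of inspecting the returned JSON, assuming the dummy URLs above are swapped for real shared links; json.loads simply reverses the json.dumps call inside get_files_summary, yielding a list of {"url": ..., "content": ...} dictionaries.

# Sketch: parse the JSON string back into Python objects and preview each file
import json

for entry in json.loads(result_json):
    print(entry["url"])
    print(entry["content"][:200])  # preview the first 200 characters of scraped text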