Tool
Read Google Drive Files
A tool for reading the content of public Google Drive files.
Tool ID
google_drive
Creator
@suciocerdo
Downloads
199+

Tool Content
python
# NOTE: This code works when run as a script, but not yet as a tool; the cause is still being investigated.

import requests
from bs4 import BeautifulSoup
import json
import re

class Tools:
    """
    Read the text content of publicly shared files and folders over HTTP.

    Files are downloaded with ``requests`` and reduced to plain text with
    BeautifulSoup. A folder is expanded into its files by scanning the
    folder's embedded list view for file "view" links.
    """

    # Seconds to wait for each HTTP request before giving up.
    REQUEST_TIMEOUT = 120
    # Maximum number of whitespace-separated words returned per file.
    MAX_WORDS = 5000
    # Matches file "view" links in a folder listing; group 1 is the file ID.
    # Compiled once here instead of on every folder lookup.
    FILE_LINK_PATTERN = re.compile(r"https://example\.com/file/d/([\w-]+)/view")

    def __init__(self):
        pass

    def scrape_file_content(self, file_url: str) -> dict:
        """
        Scrape the content of a given file URL and return it as a dictionary.

        The response body is parsed as HTML, reduced to its text, and
        truncated to at most ``MAX_WORDS`` words.

        :param file_url: The URL of the file.
        :return: A dictionary with the keys ``url`` and ``content``. If the request fails, ``content`` holds an error message instead of the file text.
        """
        try:
            response = requests.get(file_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            html_content = response.text
        except requests.exceptions.RequestException as e:
            # Report the failure in-band so one bad file does not abort a batch.
            return {"url": file_url, "content": f"Error retrieving file: {e}"}

        soup = BeautifulSoup(html_content, "html.parser")
        content = soup.get_text(separator=" ", strip=True)

        # Cap the amount of text returned for very large files.
        truncated_content = " ".join(content.split()[: self.MAX_WORDS])

        return {"url": file_url, "content": truncated_content}

    def get_combined_files_content(self, folder_url: str, specific_files: list) -> list:
        """
        Get the combined content of multiple files as a list of dictionaries.

        :param folder_url: The URL of the folder containing multiple files. May be empty when only specific files are wanted.
        :param specific_files: A list of specific file URLs. ``None`` is treated as an empty list and a single string as a one-element list.
        :return: A list of dictionaries with each file's URL and content.
        """
        # Normalise the explicit file list so a missing or single value
        # does not break the concatenation below.
        if specific_files is None:
            extra_files = []
        elif isinstance(specific_files, str):
            extra_files = [specific_files]
        else:
            extra_files = list(specific_files)

        # Folder files first, then the explicitly requested ones.
        all_files = self.extract_files_from_folder(folder_url) + extra_files
        print(f"Processing a total of {len(all_files)} files.")

        return [self.scrape_file_content(url) for url in all_files]

    def extract_files_from_folder(self, folder_url: str) -> list:
        """
        Extract all file URLs from a given folder using regex.

        The folder ID is the last path segment of ``folder_url``; any query
        string, fragment or trailing slash is ignored.

        :param folder_url: The URL of the folder.
        :return: A list of unique file URLs in the order they appear in the folder listing. Empty if no folder ID can be found or the request fails.
        """
        # Drop "?query" / "#fragment" and trailing slashes before taking the
        # last path segment, so share links such as ".../folders/ID?usp=sharing"
        # still yield "ID".
        path = re.split(r"[?#]", folder_url or "", maxsplit=1)[0]
        folder_id = path.rstrip("/").rsplit("/", 1)[-1]
        if not folder_id:
            print(f"Error accessing folder: no folder ID found in {folder_url!r}")
            return []

        export_url = f"https://example.com/embeddedfolderview?id={folder_id}#list"
        try:
            response = requests.get(export_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            html_content = response.text
        except requests.exceptions.RequestException as e:
            print(f"Error accessing folder: {e}")
            return []

        # dict.fromkeys removes duplicate IDs while keeping first-seen order,
        # so the result is deterministic and the count below is accurate.
        file_ids = dict.fromkeys(self.FILE_LINK_PATTERN.findall(html_content))
        file_links = [f"https://example.com/uc?id={file_id}" for file_id in file_ids]

        print(f"Found {len(file_links)} files in the folder.")
        return file_links

    def get_files_summary(self, folder_url: str, specific_files: list) -> str:
        """
        Get a JSON-formatted summary of the contents of multiple files.

        :param folder_url: The URL of the folder containing multiple files.
        :param specific_files: A list of specific file URLs.
        :return: A JSON string (an array of objects with ``url`` and ``content``) covering all files.
        """
        combined_content = self.get_combined_files_content(folder_url, specific_files)

        # ensure_ascii=False keeps non-ASCII text readable in the output.
        return json.dumps(combined_content, ensure_ascii=False, indent=2)

# Example usage: summarize one shared folder plus two individual files.
# NOTE(review): these statements sit at module level, so they run whenever the
# module is loaded, including the HTTP requests made by get_files_summary.
tools = Tools()

# Placeholder folder link. The folder needs to be shared to anyone with the link.
folder_url = "https://example.com/drive/folders/folderID"

# Placeholder file links. Each file needs to be shared to anyone with the link.
specific_files = [
    "https://example.com/uc?id=dummyFileID1",
    "https://example.com/uc?id=dummyFileID2",
]

# JSON string holding the URL and content of every file that was processed.
result_json = tools.get_files_summary(
    folder_url=folder_url,
    specific_files=specific_files,
)