# NOTE: works when run standalone, but fails to load as a tool — the
# module-level code at the bottom of the file executes network requests
# on import, which a tool loader will trigger.
import requests
from bs4 import BeautifulSoup
import json
import re
class Tools:
    """Helpers for scraping text content from shared file and folder URLs."""

    def __init__(self):
        pass

    def scrape_file_content(self, file_url: str) -> dict:
        """
        Scrape the text content of a given file URL.

        :param file_url: The URL of the file.
        :return: A dictionary with keys "url" and "content". On a request
                 failure, "content" carries the error message instead of
                 raising, so one bad file does not abort a whole batch.
        """
        try:
            response = requests.get(file_url, timeout=120)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            content = soup.get_text(separator=" ", strip=True)
            # Truncate very large documents to the first 5000
            # whitespace-separated tokens to keep the combined payload bounded.
            truncated_content = " ".join(content.split()[:5000])
            return {"url": file_url, "content": truncated_content}
        except requests.exceptions.RequestException as e:
            return {"url": file_url, "content": f"Error retrieving file: {str(e)}"}

    def get_combined_files_content(self, folder_url: str, specific_files: list) -> list:
        """
        Get the combined content of multiple files as a list of dictionaries.

        :param folder_url: The URL of the folder containing multiple files.
        :param specific_files: A list of specific file URLs.
        :return: A list of dictionaries with each file's URL and content.
        """
        # De-duplicate while preserving order (folder files first) so the
        # same file is never scraped twice when it appears in both lists.
        all_files = list(dict.fromkeys(
            self.extract_files_from_folder(folder_url) + specific_files
        ))
        print(f"Processing a total of {len(all_files)} files.")
        # Collect the content of each file as a list of dictionaries.
        return [self.scrape_file_content(url) for url in all_files]

    def extract_files_from_folder(self, folder_url: str) -> list:
        """
        Extract all file URLs from a given folder using regex.

        :param folder_url: The URL of the folder.
        :return: A list of file URLs (de-duplicated, first-seen order);
                 empty on a request failure.
        """
        try:
            # Strip any query string / fragment and trailing slashes so URLs
            # like ".../folders/<id>?usp=sharing" or ".../folders/<id>/"
            # still yield the bare folder id (the original split('/')[-1]
            # returned "" or the query for such URLs).
            path = folder_url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
            folder_id = path.split("/")[-1]
            export_url = f"https://example.com/embeddedfolderview?id={folder_id}#list"
            response = requests.get(export_url, timeout=120)
            response.raise_for_status()
            # Extract file URLs using regex.
            drive_link_pattern = re.compile(r"https://example.com/file/d/[\w-]+/view")
            matches = drive_link_pattern.findall(response.text)
            file_links = [
                f"https://example.com/uc?id={match.split('/d/')[1].split('/')[0]}"
                for match in matches
            ]
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # list(set(...)) made the result order non-deterministic.
            unique_links = list(dict.fromkeys(file_links))
            print(f"Found {len(unique_links)} files in the folder.")
            return unique_links
        except requests.exceptions.RequestException as e:
            print(f"Error accessing folder: {e}")
            return []

    def get_files_summary(self, folder_url: str, specific_files: list) -> str:
        """
        Get a JSON-formatted summary of the contents of multiple files.

        :param folder_url: The URL of the folder containing multiple files.
        :param specific_files: A list of specific file URLs.
        :return: A JSON string with the content of all files.
        """
        combined_content = self.get_combined_files_content(folder_url, specific_files)
        # Convert the list of dictionaries into a JSON-formatted string.
        return json.dumps(combined_content, ensure_ascii=False, indent=2)
# Run the demo only when this file is executed directly. Previously this
# code ran at module import time, firing network requests whenever the
# module was loaded (e.g. by a tool loader) — the likely reason the file
# "works but not as a tool".
if __name__ == "__main__":
    # Initialize the Tools instance
    tools = Tools()
    # Dummy URLs for a public folder and specific files
    folder_url = 'https://example.com/drive/folders/folderID'  # Dummy folder URL. Folder needs to be shared to anyone with the link
    specific_files = [
        'https://example.com/uc?id=dummyFileID1',  # Dummy file URL 1. File needs to be shared to anyone with the link
        'https://example.com/uc?id=dummyFileID2'   # Dummy file URL 2. File needs to be shared to anyone with the link
    ]
    # Retrieve the result in JSON format
    result_json = tools.get_files_summary(folder_url, specific_files)