NOTICE
Open WebUI Community is currently undergoing a major revamp to improve user experience and performance ✨

Tool
Wayback Machine API Integration Tool
Make calls to the Wayback Machine to see if a URL has been cached.
Tool ID
wayback_machine_api
Creator
@sramelyk
Downloads
537+

Tool Content
python

"""
title: Wayback Machine API Integration
author: AI Assistant
version: 1.0
license: MIT
description: A tool that integrates the Wayback Machine API for retrieving archived web pages.
requirements: requests
"""

import os
import requests
from typing import Optional, Dict, Any
from pydantic import BaseModel, Field

class Tools:
    """
    Tool collection for querying the Internet Archive's Wayback Machine.

    Two lookups are provided: the closest archived snapshot of a URL and the
    capture history of a URL. Both return plain dictionaries whose ``status``
    key is "success", "not_found" or "error"; HTTP/connection failures and
    undecodable responses are reported through the "error" dictionary instead
    of being raised.
    """

    class Valves(BaseModel):
        API_BASE_URL: str = Field(
            default="https://archive.org/wayback/available",
            description="The base URL for Wayback Machine API",
        )
        CDX_API_URL: str = Field(
            default="https://web.archive.org/cdx/search/cdx",
            description="The base URL for the Wayback CDX Server API",
        )
        USER_AGENT: str = Field(
            default="WaybackMachineAPI/1.0",
            description="User agent string for API requests",
        )

    def __init__(self):
        # Only the valves are stored. Endpoint and user agent are read from
        # them on every call, so a valves object that is replaced or edited
        # after construction (presumably how the host application applies
        # user configuration -- confirm) actually takes effect.
        self.valves = self.Valves()

    # Backward-compatible aliases for the attributes that used to be copied
    # out of the valves in __init__. They now read/write through to the valves
    # so the two can never disagree.
    @property
    def api_base_url(self) -> str:
        return self.valves.API_BASE_URL

    @api_base_url.setter
    def api_base_url(self, value: str) -> None:
        self.valves.API_BASE_URL = value

    @property
    def user_agent(self) -> str:
        return self.valves.USER_AGENT

    @user_agent.setter
    def user_agent(self, value: str) -> None:
        self.valves.USER_AGENT = value

    def get_archived_snapshot(self, url: str, timestamp: Optional[str] = None) -> Dict[str, Any]:
        """
        Retrieve the closest archived snapshot of a given URL from the Wayback Machine.
        :param url: The URL to check for archived snapshots.
        :param timestamp: Optional timestamp to find the closest snapshot (format: YYYYMMDDhhmmss).
        :return: A dictionary containing the response data.
        """
        params = {"url": url}
        if timestamp:
            params["timestamp"] = timestamp

        try:
            response = requests.get(
                self.valves.API_BASE_URL,
                params=params,
                headers={"User-Agent": self.valves.USER_AGENT},
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            return {
                "status": "error",
                "message": str(e),
                "original_url": url,
            }

        # Guard every level of the payload: a missing or empty "closest"
        # entry means there is no snapshot to report.
        snapshots = data.get("archived_snapshots") if isinstance(data, dict) else None
        snapshot = snapshots.get("closest") if isinstance(snapshots, dict) else None
        if not isinstance(snapshot, dict) or not snapshot:
            return {
                "status": "not_found",
                "message": "No archived snapshots found for the given URL.",
                "original_url": url,
            }

        return {
            "status": "success",
            "data": {
                # .get() so a field absent from the response becomes None
                # instead of an uncaught KeyError.
                "available": snapshot.get("available"),
                "url": snapshot.get("url"),
                "timestamp": snapshot.get("timestamp"),
                "status": snapshot.get("status"),
            },
            "original_url": url,
        }

    def get_capture_history(self, url: str) -> Dict[str, Any]:
        """
        Retrieve the capture history for a given URL from the Wayback CDX Server API.
        :param url: The URL to retrieve capture history for.
        :return: A dictionary containing the response data.
        """
        params = {
            "url": url,
            "output": "json",
            "fl": "timestamp,original,mimetype,statuscode",
            "collapse": "timestamp:8",  # Group by YYYYMMDD
        }

        try:
            response = requests.get(
                self.valves.CDX_API_URL,
                params=params,
                headers={"User-Agent": self.valves.USER_AGENT},
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            return {
                "status": "error",
                "message": str(e),
                "original_url": url,
            }

        # The first row is the header (field names); anything shorter than
        # header + one capture row means there is no history to report.
        if not isinstance(data, list) or len(data) <= 1:
            return {
                "status": "not_found",
                "message": "No capture history found for the given URL.",
                "original_url": url,
            }

        header, *rows = data
        return {
            "status": "success",
            "data": [dict(zip(header, row)) for row in rows],
            "original_url": url,
        }