# audio-oxide-old/json-md.py

import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re

# Make sure you have the required libraries:
# pip install requests beautifulsoup4 markdownify

def sanitize_filename(url):
    """Create a safe, lowercase filename from a URL.

    If the URL contains ``docs.rs/``, everything up to and including its
    first occurrence is dropped. Each run of characters other than ASCII
    letters, digits and dots is then collapsed into a single underscore,
    and the result is lowercased and capped at 100 characters.

    Args:
        url: The URL (or any other string) to convert.

    Returns:
        The sanitized name: at most 100 characters, with no leading or
        trailing underscore. Empty when ``url`` has no usable characters.
    """
    if "docs.rs/" in url:
        url = url.split("docs.rs/", 1)[1]
    sanitized = re.sub(r'[^a-zA-Z0-9.]+', '_', url).strip('_').lower()
    # The 100-character cut can land right after a separator; strip once
    # more so the capped name never ends with a dangling underscore.
    return sanitized[:100].rstrip('_')

def scrape_and_convert_url(url, session):
    """
    Downloads one page through the given session, extracts its main
    documentation container and returns that container as Markdown text.

    Returns None (after printing a message) when the request fails, when
    no main content container is found, or when any other error occurs.
    """
    print(f"Fetching: {url}")
    try:
        page = session.get(url, timeout=10)
        page.raise_for_status()
        document = BeautifulSoup(page.text, 'html.parser')

        # Look the container up by id first; the class-name lookup only
        # runs when the id lookup produced nothing usable.
        container = document.find(id="main-content") or document.find(class_="main-content")

        if not container:
            print(f"  -> Warning: Could not find a recognizable main content section in page.")
            return None

        converted = md(str(container), heading_style="ATX")
        print(f"  -> Success: Converted content.")
        return converted

    except requests.exceptions.RequestException as fetch_error:
        print(f"  -> Error: Failed to fetch URL {url}. Reason: {fetch_error}")
        return None
    except Exception as unexpected_error:
        print(f"  -> Error: An unexpected error occurred for {url}. Reason: {unexpected_error}")
        return None

def process_url_list(urls, output_file):
    """
    Walks through the given URLs, converts each page to Markdown and
    writes all sections, one after another, into a single output file.

    The fragment part (everything from '#') is dropped from each URL.
    Entries that are empty, do not start with "http", or were already
    handled are skipped with a printed notice.
    """
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})

        with open(output_file, 'w', encoding='utf-8') as guide:
            guide.write("# Iced Pocket Guide\n\n")

            seen = set()
            for url in urls:
                page_url = url.split('#')[0]

                # Only URLs that passed validation ever enter `seen`, so
                # the two skip reasons below can never both apply.
                if page_url in seen:
                    print(f"Skipping duplicate URL: {page_url}")
                    continue
                if not page_url or not page_url.startswith("http"):
                    print(f"Skipping invalid or anchor-only URL: {url}")
                    continue

                seen.add(page_url)

                # The section header is written before fetching, so a
                # failed page still gets its own entry in the guide.
                guide.write(f"\n---\n\n")
                guide.write(f"## Source: [{page_url}]({page_url})\n\n")

                body = scrape_and_convert_url(page_url, session)
                guide.write(body if body else "*Failed to retrieve or convert content for this URL.*")
                guide.write("\n\n")

    print(f"\n🎉 Conversion complete! All content saved to '{output_file}'.")


if __name__ == '__main__':
    # Destination for the combined Markdown guide.
    OUTPUT_FILENAME = "iced_pocket_guide.md"

    # Pages to include, in the order they appear in the generated guide.
    pocket_guide_urls = [
        # Crate root and application entry points
        "https://docs.rs/iced/0.13.1/iced/index.html",
        "https://docs.rs/iced/0.13.1/iced/advanced/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/struct.Application.html",
        # Widgets and layout
        "https://docs.rs/iced/0.13.1/iced/widget/index.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Container.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Column.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Row.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Length.html",
        "https://docs.rs/iced/0.13.1/iced/alignment/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Alignment.html",
        "https://docs.rs/iced/0.13.1/iced/type.Element.html",
        # Tasks, subscriptions and streams
        "https://docs.rs/iced/0.13.1/iced/struct.Task.html",
        "https://docs.rs/iced/0.13.1/iced/task/index.html",
        "https://docs.rs/iced/0.13.1/iced/struct.Subscription.html",
        "https://docs.rs/iced/0.13.1/iced/stream/index.html",
        # Daemon, theme and settings
        "https://docs.rs/iced/0.13.1/iced/daemon/index.html",
        "https://docs.rs/iced/0.13.1/iced/daemon/struct.Daemon.html",
        "https://docs.rs/iced/0.13.1/iced/theme/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Theme.html",
        "https://docs.rs/iced/0.13.1/iced/settings/index.html",
        "https://docs.rs/iced/0.13.1/iced/settings/struct.Settings.html",
        # Windowing and input
        "https://docs.rs/iced/0.13.1/iced/window/index.html",
        "https://docs.rs/iced/0.13.1/iced/keyboard/index.html",
        "https://docs.rs/iced/0.13.1/iced/mouse/index.html",
        "https://docs.rs/iced/0.13.1/iced/touch/index.html",
    ]

    process_url_list(pocket_guide_urls, OUTPUT_FILENAME)