# audio-oxide-old/json-md.py
#
# 123 lines
# 5.1 KiB
# Python

import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import re
# Make sure you have the required libraries:
# pip install requests beautifulsoup4 markdownify
def sanitize_filename(url, max_length=100):
    """Create a safe, lowercase filename from a URL.

    Everything up to and including the first ``docs.rs/`` is dropped, every
    run of characters other than ASCII letters, digits and dots is collapsed
    into a single underscore, and the result is lowercased and truncated.

    Args:
        url: The URL to convert.
        max_length: Maximum length of the returned name (default 100).

    Returns:
        The sanitized name without leading or trailing underscores. It is
        empty when ``url`` contains no letters, digits or dots.
    """
    if "docs.rs/" in url:
        url = url.split("docs.rs/", 1)[1]
    sanitized = re.sub(r'[^a-zA-Z0-9.]+', '_', url)
    # Strip once more after truncating: the cut can land right after a
    # separator and would otherwise leave a dangling underscore at the end.
    return sanitized.strip('_').lower()[:max_length].strip('_')
def scrape_and_convert_url(url, session):
    """Fetch a single URL and convert its main documentation content to Markdown.

    The main content element is looked up by ``id="main-content"`` first and,
    if that finds nothing, by the ``main-content`` class. Progress and
    failures are reported with ``print``; no exception reaches the caller.

    Args:
        url: Address of the page to fetch.
        session: Object whose ``get(url, timeout=...)`` returns a response
            exposing ``raise_for_status()`` and ``text``.

    Returns:
        The converted Markdown text, or ``None`` when the request failed, no
        main content element was found, or any other error occurred.
    """
    print(f"Fetching: {url}")
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer the element ID; fall back to the class name if it is absent.
        main_content = soup.find(id="main-content")
        if main_content is None:
            main_content = soup.find(class_="main-content")

        if main_content is None:
            print(" -> Warning: Could not find a recognizable main content section in page.")
            return None

        markdown_text = md(str(main_content), heading_style="ATX")
        print(" -> Success: Converted content.")
        return markdown_text
    except requests.exceptions.RequestException as e:
        print(f" -> Error: Failed to fetch URL {url}. Reason: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: one bad page must not abort the
        # whole run, so parsing/conversion errors are reported and skipped.
        print(f" -> Error: An unexpected error occurred for {url}. Reason: {e}")
        return None
def process_url_list(urls, output_file):
    """Convert a list of URLs to Markdown and save it all in one output file.

    The file starts with a fixed title. Each accepted URL then gets a
    horizontal rule, a ``## Source`` heading linking to the page, and either
    the converted content or a failure note. URL fragments (``#...``) and
    surrounding whitespace are removed before a URL is checked, so links to
    different anchors of one page are fetched only once.

    Args:
        urls: Iterable of URL strings. Entries that do not start with
            ``http`` after cleaning, and repeats, are skipped with a message.
        output_file: Path of the Markdown file to write (overwritten).
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )
    with requests.Session() as session:
        session.headers.update({'User-Agent': user_agent})
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("# Iced Pocket Guide\n\n")
            processed_urls = set()
            for url in urls:
                # Drop the fragment and stray whitespace so equivalent links
                # compare equal in the duplicate check below.
                cleaned_url = url.split('#', 1)[0].strip()
                if cleaned_url in processed_urls:
                    print(f"Skipping duplicate URL: {cleaned_url}")
                    continue
                # An empty string (anchor-only link) also fails this check.
                if not cleaned_url.startswith("http"):
                    print(f"Skipping invalid or anchor-only URL: {url}")
                    continue
                processed_urls.add(cleaned_url)

                # The section header is written before fetching, so a failed
                # page still leaves a visible entry in the output.
                f.write("\n---\n\n")
                f.write(f"## Source: [{cleaned_url}]({cleaned_url})\n\n")
                content = scrape_and_convert_url(cleaned_url, session)
                if content:
                    f.write(content)
                else:
                    f.write("*Failed to retrieve or convert content for this URL.*")
                f.write("\n\n")
    print(f"\n🎉 Conversion complete! All content saved to '{output_file}'.")
if __name__ == '__main__':
    # Pages of the iced 0.13.1 API documentation to bundle, in output order.
    pocket_guide_urls = [
        "https://docs.rs/iced/0.13.1/iced/index.html",
        "https://docs.rs/iced/0.13.1/iced/advanced/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/index.html",
        "https://docs.rs/iced/0.13.1/iced/application/struct.Application.html",
        "https://docs.rs/iced/0.13.1/iced/widget/index.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Container.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Column.html",
        "https://docs.rs/iced/0.13.1/iced/widget/struct.Row.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Length.html",
        "https://docs.rs/iced/0.13.1/iced/alignment/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Alignment.html",
        "https://docs.rs/iced/0.13.1/iced/type.Element.html",
        "https://docs.rs/iced/0.13.1/iced/struct.Task.html",
        "https://docs.rs/iced/0.13.1/iced/task/index.html",
        "https://docs.rs/iced/0.13.1/iced/struct.Subscription.html",
        "https://docs.rs/iced/0.13.1/iced/stream/index.html",
        "https://docs.rs/iced/0.13.1/iced/daemon/index.html",
        "https://docs.rs/iced/0.13.1/iced/daemon/struct.Daemon.html",
        "https://docs.rs/iced/0.13.1/iced/theme/index.html",
        "https://docs.rs/iced/0.13.1/iced/enum.Theme.html",
        "https://docs.rs/iced/0.13.1/iced/settings/index.html",
        "https://docs.rs/iced/0.13.1/iced/settings/struct.Settings.html",
        "https://docs.rs/iced/0.13.1/iced/window/index.html",
        "https://docs.rs/iced/0.13.1/iced/keyboard/index.html",
        "https://docs.rs/iced/0.13.1/iced/mouse/index.html",
        "https://docs.rs/iced/0.13.1/iced/touch/index.html",
    ]
    # Output path is relative to the current working directory.
    OUTPUT_FILENAME = "iced_pocket_guide.md"
    process_url_list(pocket_guide_urls, OUTPUT_FILENAME)